blob: aa0b4c6c53f24b63c98b041c7d15c9b4c288c2fb [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Maximum number of deallocated Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Keep-alive threshold for the Unicode object free list.

   unicode_dealloc() leaves the character buffer attached to an object
   that goes onto the free list as long as the string length is below
   this limit.  This reduces malloc() overhead for small Unicode objects.
   The limit is compared against the string length, i.e. it is counted
   in Py_UNICODE units.

   At worst this will result in roughly
       PyUnicode_MAXFREELIST *
           (sizeof(PyUnicodeObject) +
            KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
            malloc()-overhead)
   bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte-order switch: little endian unless WORDS_BIGENDIAN is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters: entry i is
   1 when ASCII code point i counts as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Fast detection of ASCII line break characters: entry i is 1 when ASCII
   code point i is a line break and 0 otherwise.  The table is only ever
   read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
211/* stuff to implement simple "bloom filters" for Unicode characters.
212 to keep things simple, we use a single bitmask, using the least 5
213 bits from each unicode characters as the bit index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
/* Storage type of a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit for character ch in mask (mask must be
   an lvalue).
   BLOOM(mask, ch): non-zero if the bit for ch is set in mask.  A zero
   result proves ch was never added; a non-zero result may be a false
   positive because different characters share a bit.
   Every argument is parenthesized so that expressions such as
   BLOOM(a | b, ch) expand with the intended precedence. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: ASCII characters are looked up in ascii_linebreak;
   everything else is first filtered through bloom_linebreak and, on a
   hit, confirmed with Py_UNICODE_ISLINEBREAK. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Non-zero if chr is one of the setlen characters in set.  The bloom mask
   is consulted first so that most non-members are rejected without the
   linear scan.  The whole expansion is parenthesized so that the macro
   can be combined safely with other operators, e.g. !BLOOM_MEMBER(...). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
318/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000319 Ux0000 terminated; some code (e.g. new_identifier)
320 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
725
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000726/* size of fixed-size buffer for formatting single arguments */
727#define ITEM_BUFFER_LEN 21
728/* maximum number of characters required for output of %ld. 21 characters
729 allows for 64-bit integers (in decimal) and an optional sign. */
730#define MAX_LONG_CHARS 21
731/* maximum number of characters required for output of %lld.
732 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
733 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
734#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 "coercing to str: need string or buffer, "
1277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
/* Copy `encoding` into `lower`, converting upper case letters to lower
   case and every '_' to '-' (so that e.g. UTF_8 is recognized as utf-8).

   `lower_len` is the size of the `lower` buffer including room for the
   terminating NUL.  Returns 1 on success, 0 if the encoding name does not
   fit (it is longer than lower_len-1 characters); in that case `lower`
   is left without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *const limit = lower + (lower_len - 1);  /* slot kept for NUL */

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (ISUPPER(*src))
            ch = TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;

        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1328
1329PyObject *PyUnicode_Decode(const char *s,
1330 Py_ssize_t size,
1331 const char *encoding,
1332 const char *errors)
1333{
1334 PyObject *buffer = NULL, *unicode;
1335 Py_buffer info;
1336 char lower[11]; /* Enough for any encoding shortcut */
1337
1338 if (encoding == NULL)
1339 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001340
1341 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001342 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1343 if (strcmp(lower, "utf-8") == 0)
1344 return PyUnicode_DecodeUTF8(s, size, errors);
1345 else if ((strcmp(lower, "latin-1") == 0) ||
1346 (strcmp(lower, "iso-8859-1") == 0))
1347 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001348#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001349 else if (strcmp(lower, "mbcs") == 0)
1350 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001351#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001352 else if (strcmp(lower, "ascii") == 0)
1353 return PyUnicode_DecodeASCII(s, size, errors);
1354 else if (strcmp(lower, "utf-16") == 0)
1355 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1356 else if (strcmp(lower, "utf-32") == 0)
1357 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359
1360 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001361 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001362 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001363 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001364 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 if (buffer == NULL)
1366 goto onError;
1367 unicode = PyCodec_Decode(buffer, encoding, errors);
1368 if (unicode == NULL)
1369 goto onError;
1370 if (!PyUnicode_Check(unicode)) {
1371 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001372 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001373 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001374 Py_DECREF(unicode);
1375 goto onError;
1376 }
1377 Py_DECREF(buffer);
1378 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001379
Benjamin Peterson29060642009-01-31 22:14:21 +00001380 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 Py_XDECREF(buffer);
1382 return NULL;
1383}
1384
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001385PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1386 const char *encoding,
1387 const char *errors)
1388{
1389 PyObject *v;
1390
1391 if (!PyUnicode_Check(unicode)) {
1392 PyErr_BadArgument();
1393 goto onError;
1394 }
1395
1396 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001398
1399 /* Decode via the codec registry */
1400 v = PyCodec_Decode(unicode, encoding, errors);
1401 if (v == NULL)
1402 goto onError;
1403 return v;
1404
Benjamin Peterson29060642009-01-31 22:14:21 +00001405 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001406 return NULL;
1407}
1408
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001409PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1410 const char *encoding,
1411 const char *errors)
1412{
1413 PyObject *v;
1414
1415 if (!PyUnicode_Check(unicode)) {
1416 PyErr_BadArgument();
1417 goto onError;
1418 }
1419
1420 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001422
1423 /* Decode via the codec registry */
1424 v = PyCodec_Decode(unicode, encoding, errors);
1425 if (v == NULL)
1426 goto onError;
1427 if (!PyUnicode_Check(v)) {
1428 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001429 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001430 Py_TYPE(v)->tp_name);
1431 Py_DECREF(v);
1432 goto onError;
1433 }
1434 return v;
1435
Benjamin Peterson29060642009-01-31 22:14:21 +00001436 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001437 return NULL;
1438}
1439
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001441 Py_ssize_t size,
1442 const char *encoding,
1443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444{
1445 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001446
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 unicode = PyUnicode_FromUnicode(s, size);
1448 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1451 Py_DECREF(unicode);
1452 return v;
1453}
1454
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001455PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1456 const char *encoding,
1457 const char *errors)
1458{
1459 PyObject *v;
1460
1461 if (!PyUnicode_Check(unicode)) {
1462 PyErr_BadArgument();
1463 goto onError;
1464 }
1465
1466 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001467 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001468
1469 /* Encode via the codec registry */
1470 v = PyCodec_Encode(unicode, encoding, errors);
1471 if (v == NULL)
1472 goto onError;
1473 return v;
1474
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001476 return NULL;
1477}
1478
Victor Stinnerae6265f2010-05-15 16:27:27 +00001479PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1480{
1481 if (Py_FileSystemDefaultEncoding)
1482 return PyUnicode_AsEncodedString(unicode,
1483 Py_FileSystemDefaultEncoding,
1484 "surrogateescape");
1485 else
1486 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1487 PyUnicode_GET_SIZE(unicode),
1488 "surrogateescape");
1489}
1490
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1492 const char *encoding,
1493 const char *errors)
1494{
1495 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001496 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001497
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 }
Fred Drakee4315f52000-05-09 19:53:39 +00001502
Tim Petersced69f82003-09-16 20:30:58 +00001503 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001505
1506 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001507 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1508 if (strcmp(lower, "utf-8") == 0)
1509 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1510 PyUnicode_GET_SIZE(unicode),
1511 errors);
1512 else if ((strcmp(lower, "latin-1") == 0) ||
1513 (strcmp(lower, "iso-8859-1") == 0))
1514 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1515 PyUnicode_GET_SIZE(unicode),
1516 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001517#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001518 else if (strcmp(lower, "mbcs") == 0)
1519 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1520 PyUnicode_GET_SIZE(unicode),
1521 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001522#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001523 else if (strcmp(lower, "ascii") == 0)
1524 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1525 PyUnicode_GET_SIZE(unicode),
1526 errors);
1527 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001528 /* During bootstrap, we may need to find the encodings
1529 package, to load the file system encoding, and require the
1530 file system encoding in order to load the encodings
1531 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001532
Victor Stinner59e62db2010-05-15 13:14:32 +00001533 Break out of this dependency by assuming that the path to
1534 the encodings module is ASCII-only. XXX could try wcstombs
1535 instead, if the file system encoding is the locale's
1536 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001537 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001538 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1539 !PyThreadState_GET()->interp->codecs_initialized)
1540 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1541 PyUnicode_GET_SIZE(unicode),
1542 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543
1544 /* Encode via the codec registry */
1545 v = PyCodec_Encode(unicode, encoding, errors);
1546 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001547 return NULL;
1548
1549 /* The normal path */
1550 if (PyBytes_Check(v))
1551 return v;
1552
1553 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001554 if (PyByteArray_Check(v)) {
1555 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001556 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001557 PyOS_snprintf(msg, sizeof(msg),
1558 "encoder %s returned buffer instead of bytes",
1559 encoding);
1560 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001561 Py_DECREF(v);
1562 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001563 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001564
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001565 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1566 Py_DECREF(v);
1567 return b;
1568 }
1569
1570 PyErr_Format(PyExc_TypeError,
1571 "encoder did not return a bytes object (type=%.400s)",
1572 Py_TYPE(v)->tp_name);
1573 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001574 return NULL;
1575}
1576
1577PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1578 const char *encoding,
1579 const char *errors)
1580{
1581 PyObject *v;
1582
1583 if (!PyUnicode_Check(unicode)) {
1584 PyErr_BadArgument();
1585 goto onError;
1586 }
1587
1588 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001590
1591 /* Encode via the codec registry */
1592 v = PyCodec_Encode(unicode, encoding, errors);
1593 if (v == NULL)
1594 goto onError;
1595 if (!PyUnicode_Check(v)) {
1596 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001597 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001598 Py_TYPE(v)->tp_name);
1599 Py_DECREF(v);
1600 goto onError;
1601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001603
Benjamin Peterson29060642009-01-31 22:14:21 +00001604 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 return NULL;
1606}
1607
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001608PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001609 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001610{
1611 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001612 if (v)
1613 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001614 if (errors != NULL)
1615 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001616 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001617 PyUnicode_GET_SIZE(unicode),
1618 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001619 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001620 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001621 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001622 return v;
1623}
1624
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001625PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001626PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001627 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001628 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1629}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001630
Christian Heimes5894ba72007-11-04 11:43:14 +00001631PyObject*
1632PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1633{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001634 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1635 can be undefined. If it is case, decode using UTF-8. The following assumes
1636 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1637 bootstrapping process where the codecs aren't ready yet.
1638 */
1639 if (Py_FileSystemDefaultEncoding) {
1640#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001641 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001642 return PyUnicode_DecodeMBCS(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001643 }
1644#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001645 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001646 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001647 }
1648#endif
1649 return PyUnicode_Decode(s, size,
1650 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001651 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001652 }
1653 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001654 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001655 }
1656}
1657
Martin v. Löwis011e8422009-05-05 04:43:17 +00001658/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001659 system encoding. The addr param must be a PyObject**.
1660 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001661
1662int
1663PyUnicode_FSConverter(PyObject* arg, void* addr)
1664{
1665 PyObject *output = NULL;
1666 Py_ssize_t size;
1667 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001668 if (arg == NULL) {
1669 Py_DECREF(*(PyObject**)addr);
1670 return 1;
1671 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001672 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001673 output = arg;
1674 Py_INCREF(output);
1675 }
1676 else {
1677 arg = PyUnicode_FromObject(arg);
1678 if (!arg)
1679 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001680 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001681 Py_DECREF(arg);
1682 if (!output)
1683 return 0;
1684 if (!PyBytes_Check(output)) {
1685 Py_DECREF(output);
1686 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1687 return 0;
1688 }
1689 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001690 size = PyBytes_GET_SIZE(output);
1691 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001692 if (size != strlen(data)) {
1693 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1694 Py_DECREF(output);
1695 return 0;
1696 }
1697 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001698 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001699}
1700
1701
Martin v. Löwis5b222132007-06-10 09:51:05 +00001702char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001703_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001704{
Christian Heimesf3863112007-11-22 07:46:41 +00001705 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001706 if (!PyUnicode_Check(unicode)) {
1707 PyErr_BadArgument();
1708 return NULL;
1709 }
Christian Heimesf3863112007-11-22 07:46:41 +00001710 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1711 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001712 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001713 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001714 *psize = PyBytes_GET_SIZE(bytes);
1715 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001716}
1717
1718char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001719_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001720{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001721 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001722}
1723
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1725{
1726 if (!PyUnicode_Check(unicode)) {
1727 PyErr_BadArgument();
1728 goto onError;
1729 }
1730 return PyUnicode_AS_UNICODE(unicode);
1731
Benjamin Peterson29060642009-01-31 22:14:21 +00001732 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 return NULL;
1734}
1735
Martin v. Löwis18e16552006-02-15 17:27:45 +00001736Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737{
1738 if (!PyUnicode_Check(unicode)) {
1739 PyErr_BadArgument();
1740 goto onError;
1741 }
1742 return PyUnicode_GET_SIZE(unicode);
1743
Benjamin Peterson29060642009-01-31 22:14:21 +00001744 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 return -1;
1746}
1747
Thomas Wouters78890102000-07-22 19:25:51 +00001748const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001749{
1750 return unicode_default_encoding;
1751}
1752
1753int PyUnicode_SetDefaultEncoding(const char *encoding)
1754{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001755 if (strcmp(encoding, unicode_default_encoding) != 0) {
1756 PyErr_Format(PyExc_ValueError,
1757 "Can only set default encoding to %s",
1758 unicode_default_encoding);
1759 return -1;
1760 }
Fred Drakee4315f52000-05-09 19:53:39 +00001761 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001762}
1763
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001764/* error handling callback helper:
1765 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001766 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001767 and adjust various state variables.
1768 return 0 on success, -1 on error
1769*/
1770
1771static
1772int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001773 const char *encoding, const char *reason,
1774 const char **input, const char **inend, Py_ssize_t *startinpos,
1775 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1776 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001777{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001778 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779
1780 PyObject *restuple = NULL;
1781 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001782 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001783 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001784 Py_ssize_t requiredsize;
1785 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001787 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001789 int res = -1;
1790
1791 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001792 *errorHandler = PyCodec_LookupError(errors);
1793 if (*errorHandler == NULL)
1794 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 }
1796
1797 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001798 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001799 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1800 if (*exceptionObject == NULL)
1801 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001802 }
1803 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001804 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1805 goto onError;
1806 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1807 goto onError;
1808 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1809 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 }
1811
1812 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1813 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001814 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001816 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001817 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 }
1819 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001820 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001821
1822 /* Copy back the bytes variables, which might have been modified by the
1823 callback */
1824 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1825 if (!inputobj)
1826 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001827 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001828 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001829 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001830 *input = PyBytes_AS_STRING(inputobj);
1831 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001832 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001833 /* we can DECREF safely, as the exception has another reference,
1834 so the object won't go away. */
1835 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001836
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001838 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001839 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001840 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1841 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001842 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843
1844 /* need more space? (at least enough for what we
1845 have+the replacement+the rest of the string (starting
1846 at the new input position), so we won't have to check space
1847 when there are no errors in the rest of the string) */
1848 repptr = PyUnicode_AS_UNICODE(repunicode);
1849 repsize = PyUnicode_GET_SIZE(repunicode);
1850 requiredsize = *outpos + repsize + insize-newpos;
1851 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001852 if (requiredsize<2*outsize)
1853 requiredsize = 2*outsize;
1854 if (_PyUnicode_Resize(output, requiredsize) < 0)
1855 goto onError;
1856 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001857 }
1858 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001859 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001860 Py_UNICODE_COPY(*outptr, repptr, repsize);
1861 *outptr += repsize;
1862 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001863
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 /* we made it! */
1865 res = 0;
1866
Benjamin Peterson29060642009-01-31 22:14:21 +00001867 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 Py_XDECREF(restuple);
1869 return res;
1870}
1871
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872/* --- UTF-7 Codec -------------------------------------------------------- */
1873
Antoine Pitrou244651a2009-05-04 18:56:13 +00001874/* See RFC2152 for details. We encode conservatively and decode liberally. */
1875
/* Three simple macros defining base-64. */

/* IS_BASE64: true if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' ||                          \
     (c) == '/')

/* FROM_BASE64: the 6-bit value of c.  Only meaningful when IS_BASE64(c)
   holds; anything not matched by the earlier tests maps to 63. */

#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1906
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked before it is used as a table index.  directO and
 * directWS are parenthesized so that an expression argument such as
 * `a || b` is evaluated as a whole before being combined with the
 * category test. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001952
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001953PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001954 Py_ssize_t size,
1955 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001956{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001957 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1958}
1959
Antoine Pitrou244651a2009-05-04 18:56:13 +00001960/* The decoder. The only state we preserve is our read position,
1961 * i.e. how many characters we have consumed. So if we end in the
1962 * middle of a shift sequence we have to back off the read position
1963 * and the output to the beginning of the sequence, otherwise we lose
1964 * all the shift state (seen bits, number of bits seen, high
1965 * surrogate). */
1966
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001967PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001968 Py_ssize_t size,
1969 const char *errors,
1970 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001971{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001972 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001973 Py_ssize_t startinpos;
1974 Py_ssize_t endinpos;
1975 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001976 const char *e;
1977 PyUnicodeObject *unicode;
1978 Py_UNICODE *p;
1979 const char *errmsg = "";
1980 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001981 Py_UNICODE *shiftOutStart;
1982 unsigned int base64bits = 0;
1983 unsigned long base64buffer = 0;
1984 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 PyObject *errorHandler = NULL;
1986 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001987
1988 unicode = _PyUnicode_New(size);
1989 if (!unicode)
1990 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001991 if (size == 0) {
1992 if (consumed)
1993 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001994 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001995 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001996
1997 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001998 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001999 e = s + size;
2000
2001 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002002 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002003 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002004 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002005
Antoine Pitrou244651a2009-05-04 18:56:13 +00002006 if (inShift) { /* in a base-64 section */
2007 if (IS_BASE64(ch)) { /* consume a base-64 character */
2008 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2009 base64bits += 6;
2010 s++;
2011 if (base64bits >= 16) {
2012 /* we have enough bits for a UTF-16 value */
2013 Py_UNICODE outCh = (Py_UNICODE)
2014 (base64buffer >> (base64bits-16));
2015 base64bits -= 16;
2016 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2017 if (surrogate) {
2018 /* expecting a second surrogate */
2019 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2020#ifdef Py_UNICODE_WIDE
2021 *p++ = (((surrogate & 0x3FF)<<10)
2022 | (outCh & 0x3FF)) + 0x10000;
2023#else
2024 *p++ = surrogate;
2025 *p++ = outCh;
2026#endif
2027 surrogate = 0;
2028 }
2029 else {
2030 surrogate = 0;
2031 errmsg = "second surrogate missing";
2032 goto utf7Error;
2033 }
2034 }
2035 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2036 /* first surrogate */
2037 surrogate = outCh;
2038 }
2039 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2040 errmsg = "unexpected second surrogate";
2041 goto utf7Error;
2042 }
2043 else {
2044 *p++ = outCh;
2045 }
2046 }
2047 }
2048 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002049 inShift = 0;
2050 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002051 if (surrogate) {
2052 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002053 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002054 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002055 if (base64bits > 0) { /* left-over bits */
2056 if (base64bits >= 6) {
2057 /* We've seen at least one base-64 character */
2058 errmsg = "partial character in shift sequence";
2059 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002060 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002061 else {
2062 /* Some bits remain; they should be zero */
2063 if (base64buffer != 0) {
2064 errmsg = "non-zero padding bits in shift sequence";
2065 goto utf7Error;
2066 }
2067 }
2068 }
2069 if (ch != '-') {
2070 /* '-' is absorbed; other terminating
2071 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002072 *p++ = ch;
2073 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002074 }
2075 }
2076 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002077 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002078 s++; /* consume '+' */
2079 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002080 s++;
2081 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002082 }
2083 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002084 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002085 shiftOutStart = p;
2086 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002087 }
2088 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002089 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002090 *p++ = ch;
2091 s++;
2092 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002093 else {
2094 startinpos = s-starts;
2095 s++;
2096 errmsg = "unexpected special character";
2097 goto utf7Error;
2098 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002099 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002100utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002101 outpos = p-PyUnicode_AS_UNICODE(unicode);
2102 endinpos = s-starts;
2103 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002104 errors, &errorHandler,
2105 "utf7", errmsg,
2106 &starts, &e, &startinpos, &endinpos, &exc, &s,
2107 &unicode, &outpos, &p))
2108 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002109 }
2110
Antoine Pitrou244651a2009-05-04 18:56:13 +00002111 /* end of string */
2112
2113 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2114 /* if we're in an inconsistent state, that's an error */
2115 if (surrogate ||
2116 (base64bits >= 6) ||
2117 (base64bits > 0 && base64buffer != 0)) {
2118 outpos = p-PyUnicode_AS_UNICODE(unicode);
2119 endinpos = size;
2120 if (unicode_decode_call_errorhandler(
2121 errors, &errorHandler,
2122 "utf7", "unterminated shift sequence",
2123 &starts, &e, &startinpos, &endinpos, &exc, &s,
2124 &unicode, &outpos, &p))
2125 goto onError;
2126 if (s < e)
2127 goto restart;
2128 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002129 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002130
2131 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002132 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002133 if (inShift) {
2134 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002135 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002136 }
2137 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002138 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002139 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002140 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002141
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002142 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002143 goto onError;
2144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002145 Py_XDECREF(errorHandler);
2146 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002147 return (PyObject *)unicode;
2148
Benjamin Peterson29060642009-01-31 22:14:21 +00002149 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 Py_XDECREF(errorHandler);
2151 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002152 Py_DECREF(unicode);
2153 return NULL;
2154}
2155
2156
2157PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002158 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002159 int base64SetO,
2160 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002161 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002162{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002163 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002164 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002165 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002166 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002167 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002168 unsigned int base64bits = 0;
2169 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002170 char * out;
2171 char * start;
2172
2173 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002174 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002175
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002176 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002177 return PyErr_NoMemory();
2178
Antoine Pitrou244651a2009-05-04 18:56:13 +00002179 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002180 if (v == NULL)
2181 return NULL;
2182
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002183 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002184 for (;i < size; ++i) {
2185 Py_UNICODE ch = s[i];
2186
Antoine Pitrou244651a2009-05-04 18:56:13 +00002187 if (inShift) {
2188 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2189 /* shifting out */
2190 if (base64bits) { /* output remaining bits */
2191 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2192 base64buffer = 0;
2193 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002194 }
2195 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002196 /* Characters not in the BASE64 set implicitly unshift the sequence
2197 so no '-' is required, except if the character is itself a '-' */
2198 if (IS_BASE64(ch) || ch == '-') {
2199 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002200 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002201 *out++ = (char) ch;
2202 }
2203 else {
2204 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002205 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002207 else { /* not in a shift sequence */
2208 if (ch == '+') {
2209 *out++ = '+';
2210 *out++ = '-';
2211 }
2212 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2213 *out++ = (char) ch;
2214 }
2215 else {
2216 *out++ = '+';
2217 inShift = 1;
2218 goto encode_char;
2219 }
2220 }
2221 continue;
2222encode_char:
2223#ifdef Py_UNICODE_WIDE
2224 if (ch >= 0x10000) {
2225 /* code first surrogate */
2226 base64bits += 16;
2227 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2228 while (base64bits >= 6) {
2229 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2230 base64bits -= 6;
2231 }
2232 /* prepare second surrogate */
2233 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2234 }
2235#endif
2236 base64bits += 16;
2237 base64buffer = (base64buffer << 16) | ch;
2238 while (base64bits >= 6) {
2239 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2240 base64bits -= 6;
2241 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002242 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002243 if (base64bits)
2244 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2245 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002246 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002247 if (_PyBytes_Resize(&v, out - start) < 0)
2248 return NULL;
2249 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002250}
2251
Antoine Pitrou244651a2009-05-04 18:56:13 +00002252#undef IS_BASE64
2253#undef FROM_BASE64
2254#undef TO_BASE64
2255#undef DECODE_DIRECT
2256#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002257
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258/* --- UTF-8 Codec -------------------------------------------------------- */
2259
/* Sequence length implied by each possible UTF-8 lead byte.  A zero entry
   marks a byte that is not a legal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six,
       0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2281
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 Py_ssize_t size,
2284 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285{
Walter Dörwald69652032004-09-07 20:24:22 +00002286 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2287}
2288
Antoine Pitrouab868312009-01-10 15:40:25 +00002289/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2290#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2291
2292/* Mask to quickly check whether a C 'long' contains a
2293 non-ASCII, UTF8-encoded char. */
2294#if (SIZEOF_LONG == 8)
2295# define ASCII_CHAR_MASK 0x8080808080808080L
2296#elif (SIZEOF_LONG == 4)
2297# define ASCII_CHAR_MASK 0x80808080L
2298#else
2299# error C 'long' size should be either 4 or 8!
2300#endif
2301
Walter Dörwald69652032004-09-07 20:24:22 +00002302PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002303 Py_ssize_t size,
2304 const char *errors,
2305 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002306{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002307 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002309 Py_ssize_t startinpos;
2310 Py_ssize_t endinpos;
2311 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002312 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 PyUnicodeObject *unicode;
2314 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002315 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002316 PyObject *errorHandler = NULL;
2317 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318
2319 /* Note: size will always be longer than the resulting Unicode
2320 character count */
2321 unicode = _PyUnicode_New(size);
2322 if (!unicode)
2323 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002324 if (size == 0) {
2325 if (consumed)
2326 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329
2330 /* Unpack UTF-8 encoded data */
2331 p = unicode->str;
2332 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002333 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334
2335 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002336 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337
2338 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002339 /* Fast path for runs of ASCII characters. Given that common UTF-8
2340 input will consist of an overwhelming majority of ASCII
2341 characters, we try to optimize for this case by checking
2342 as many characters as a C 'long' can contain.
2343 First, check if we can do an aligned read, as most CPUs have
2344 a penalty for unaligned reads.
2345 */
2346 if (!((size_t) s & LONG_PTR_MASK)) {
2347 /* Help register allocation */
2348 register const char *_s = s;
2349 register Py_UNICODE *_p = p;
2350 while (_s < aligned_end) {
2351 /* Read a whole long at a time (either 4 or 8 bytes),
2352 and do a fast unrolled copy if it only contains ASCII
2353 characters. */
2354 unsigned long data = *(unsigned long *) _s;
2355 if (data & ASCII_CHAR_MASK)
2356 break;
2357 _p[0] = (unsigned char) _s[0];
2358 _p[1] = (unsigned char) _s[1];
2359 _p[2] = (unsigned char) _s[2];
2360 _p[3] = (unsigned char) _s[3];
2361#if (SIZEOF_LONG == 8)
2362 _p[4] = (unsigned char) _s[4];
2363 _p[5] = (unsigned char) _s[5];
2364 _p[6] = (unsigned char) _s[6];
2365 _p[7] = (unsigned char) _s[7];
2366#endif
2367 _s += SIZEOF_LONG;
2368 _p += SIZEOF_LONG;
2369 }
2370 s = _s;
2371 p = _p;
2372 if (s == e)
2373 break;
2374 ch = (unsigned char)*s;
2375 }
2376 }
2377
2378 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002379 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002380 s++;
2381 continue;
2382 }
2383
2384 n = utf8_code_length[ch];
2385
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002386 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002387 if (consumed)
2388 break;
2389 else {
2390 errmsg = "unexpected end of data";
2391 startinpos = s-starts;
2392 endinpos = size;
2393 goto utf8Error;
2394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002395 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396
2397 switch (n) {
2398
2399 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002400 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002401 startinpos = s-starts;
2402 endinpos = startinpos+1;
2403 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404
2405 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002406 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002407 startinpos = s-starts;
2408 endinpos = startinpos+1;
2409 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410
2411 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002412 if ((s[1] & 0xc0) != 0x80) {
2413 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002414 startinpos = s-starts;
2415 endinpos = startinpos+2;
2416 goto utf8Error;
2417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002419 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002420 startinpos = s-starts;
2421 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002422 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002423 goto utf8Error;
2424 }
2425 else
2426 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427 break;
2428
2429 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002430 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002431 (s[2] & 0xc0) != 0x80) {
2432 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002433 startinpos = s-starts;
2434 endinpos = startinpos+3;
2435 goto utf8Error;
2436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002438 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002439 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002440 startinpos = s-starts;
2441 endinpos = startinpos+3;
2442 goto utf8Error;
2443 }
2444 else
2445 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002446 break;
2447
2448 case 4:
2449 if ((s[1] & 0xc0) != 0x80 ||
2450 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002451 (s[3] & 0xc0) != 0x80) {
2452 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002453 startinpos = s-starts;
2454 endinpos = startinpos+4;
2455 goto utf8Error;
2456 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002457 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002458 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002459 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002460 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002461 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002462 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002463 UTF-16 */
2464 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002465 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002466 startinpos = s-starts;
2467 endinpos = startinpos+4;
2468 goto utf8Error;
2469 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002470#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002471 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002472#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002473 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002474
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002475 /* translate from 10000..10FFFF to 0..FFFF */
2476 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002477
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002478 /* high surrogate = top 10 bits added to D800 */
2479 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002480
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002481 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002482 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002483#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 break;
2485
2486 default:
2487 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002488 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002489 startinpos = s-starts;
2490 endinpos = startinpos+n;
2491 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 }
2493 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002495
Benjamin Peterson29060642009-01-31 22:14:21 +00002496 utf8Error:
2497 outpos = p-PyUnicode_AS_UNICODE(unicode);
2498 if (unicode_decode_call_errorhandler(
2499 errors, &errorHandler,
2500 "utf8", errmsg,
2501 &starts, &e, &startinpos, &endinpos, &exc, &s,
2502 &unicode, &outpos, &p))
2503 goto onError;
2504 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 }
Walter Dörwald69652032004-09-07 20:24:22 +00002506 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002507 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508
2509 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002510 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 goto onError;
2512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002513 Py_XDECREF(errorHandler);
2514 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 return (PyObject *)unicode;
2516
Benjamin Peterson29060642009-01-31 22:14:21 +00002517 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002518 Py_XDECREF(errorHandler);
2519 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520 Py_DECREF(unicode);
2521 return NULL;
2522}
2523
Antoine Pitrouab868312009-01-10 15:40:25 +00002524#undef ASCII_CHAR_MASK
2525
2526
Tim Peters602f7402002-04-27 18:03:26 +00002527/* Allocation strategy: if the string is short, convert into a stack buffer
2528 and allocate exactly as much space needed at the end. Else allocate the
2529 maximum possible needed (4 result bytes per Unicode character), and return
2530 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002531*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002532PyObject *
2533PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002534 Py_ssize_t size,
2535 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536{
Tim Peters602f7402002-04-27 18:03:26 +00002537#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002538
Guido van Rossum98297ee2007-11-06 21:34:58 +00002539 Py_ssize_t i; /* index into s of next input byte */
2540 PyObject *result; /* result string object */
2541 char *p; /* next free byte in output buffer */
2542 Py_ssize_t nallocated; /* number of result bytes allocated */
2543 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002544 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002545 PyObject *errorHandler = NULL;
2546 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002547
Tim Peters602f7402002-04-27 18:03:26 +00002548 assert(s != NULL);
2549 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550
Tim Peters602f7402002-04-27 18:03:26 +00002551 if (size <= MAX_SHORT_UNICHARS) {
2552 /* Write into the stack buffer; nallocated can't overflow.
2553 * At the end, we'll allocate exactly as much heap space as it
2554 * turns out we need.
2555 */
2556 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002557 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002558 p = stackbuf;
2559 }
2560 else {
2561 /* Overallocate on the heap, and give the excess back at the end. */
2562 nallocated = size * 4;
2563 if (nallocated / 4 != size) /* overflow! */
2564 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002565 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002566 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002567 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002568 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002569 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002570
Tim Peters602f7402002-04-27 18:03:26 +00002571 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002572 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002573
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002574 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002575 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002577
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002579 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002580 *p++ = (char)(0xc0 | (ch >> 6));
2581 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002582 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002583#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002584 /* Special case: check for high and low surrogate */
2585 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2586 Py_UCS4 ch2 = s[i];
2587 /* Combine the two surrogates to form a UCS4 value */
2588 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2589 i++;
2590
2591 /* Encode UCS4 Unicode ordinals */
2592 *p++ = (char)(0xf0 | (ch >> 18));
2593 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002594 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2595 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002596 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002597#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002598 Py_ssize_t newpos;
2599 PyObject *rep;
2600 Py_ssize_t repsize, k;
2601 rep = unicode_encode_call_errorhandler
2602 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2603 s, size, &exc, i-1, i, &newpos);
2604 if (!rep)
2605 goto error;
2606
2607 if (PyBytes_Check(rep))
2608 repsize = PyBytes_GET_SIZE(rep);
2609 else
2610 repsize = PyUnicode_GET_SIZE(rep);
2611
2612 if (repsize > 4) {
2613 Py_ssize_t offset;
2614
2615 if (result == NULL)
2616 offset = p - stackbuf;
2617 else
2618 offset = p - PyBytes_AS_STRING(result);
2619
2620 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2621 /* integer overflow */
2622 PyErr_NoMemory();
2623 goto error;
2624 }
2625 nallocated += repsize - 4;
2626 if (result != NULL) {
2627 if (_PyBytes_Resize(&result, nallocated) < 0)
2628 goto error;
2629 } else {
2630 result = PyBytes_FromStringAndSize(NULL, nallocated);
2631 if (result == NULL)
2632 goto error;
2633 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2634 }
2635 p = PyBytes_AS_STRING(result) + offset;
2636 }
2637
2638 if (PyBytes_Check(rep)) {
2639 char *prep = PyBytes_AS_STRING(rep);
2640 for(k = repsize; k > 0; k--)
2641 *p++ = *prep++;
2642 } else /* rep is unicode */ {
2643 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2644 Py_UNICODE c;
2645
2646 for(k=0; k<repsize; k++) {
2647 c = prep[k];
2648 if (0x80 <= c) {
2649 raise_encode_exception(&exc, "utf-8", s, size,
2650 i-1, i, "surrogates not allowed");
2651 goto error;
2652 }
2653 *p++ = (char)prep[k];
2654 }
2655 }
2656 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002657#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002658 }
Victor Stinner445a6232010-04-22 20:01:57 +00002659#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002660 } else if (ch < 0x10000) {
2661 *p++ = (char)(0xe0 | (ch >> 12));
2662 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2663 *p++ = (char)(0x80 | (ch & 0x3f));
2664 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002665 /* Encode UCS4 Unicode ordinals */
2666 *p++ = (char)(0xf0 | (ch >> 18));
2667 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2668 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2669 *p++ = (char)(0x80 | (ch & 0x3f));
2670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002672
Guido van Rossum98297ee2007-11-06 21:34:58 +00002673 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002674 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002675 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002676 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002677 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002678 }
2679 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002680 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002681 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002682 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002683 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002684 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002685 Py_XDECREF(errorHandler);
2686 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002687 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002688 error:
2689 Py_XDECREF(errorHandler);
2690 Py_XDECREF(exc);
2691 Py_XDECREF(result);
2692 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002693
Tim Peters602f7402002-04-27 18:03:26 +00002694#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695}
2696
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2698{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 if (!PyUnicode_Check(unicode)) {
2700 PyErr_BadArgument();
2701 return NULL;
2702 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002703 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002704 PyUnicode_GET_SIZE(unicode),
2705 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706}
2707
Walter Dörwald41980ca2007-08-16 21:55:45 +00002708/* --- UTF-32 Codec ------------------------------------------------------- */
2709
2710PyObject *
2711PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002712 Py_ssize_t size,
2713 const char *errors,
2714 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002715{
2716 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2717}
2718
2719PyObject *
2720PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002721 Py_ssize_t size,
2722 const char *errors,
2723 int *byteorder,
2724 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002725{
2726 const char *starts = s;
2727 Py_ssize_t startinpos;
2728 Py_ssize_t endinpos;
2729 Py_ssize_t outpos;
2730 PyUnicodeObject *unicode;
2731 Py_UNICODE *p;
2732#ifndef Py_UNICODE_WIDE
2733 int i, pairs;
2734#else
2735 const int pairs = 0;
2736#endif
2737 const unsigned char *q, *e;
2738 int bo = 0; /* assume native ordering by default */
2739 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002740 /* Offsets from q for retrieving bytes in the right order. */
2741#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2742 int iorder[] = {0, 1, 2, 3};
2743#else
2744 int iorder[] = {3, 2, 1, 0};
2745#endif
2746 PyObject *errorHandler = NULL;
2747 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002748 /* On narrow builds we split characters outside the BMP into two
2749 codepoints => count how much extra space we need. */
2750#ifndef Py_UNICODE_WIDE
2751 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002752 if (((Py_UCS4 *)s)[i] >= 0x10000)
2753 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002754#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002755
2756 /* This might be one to much, because of a BOM */
2757 unicode = _PyUnicode_New((size+3)/4+pairs);
2758 if (!unicode)
2759 return NULL;
2760 if (size == 0)
2761 return (PyObject *)unicode;
2762
2763 /* Unpack UTF-32 encoded data */
2764 p = unicode->str;
2765 q = (unsigned char *)s;
2766 e = q + size;
2767
2768 if (byteorder)
2769 bo = *byteorder;
2770
2771 /* Check for BOM marks (U+FEFF) in the input and adjust current
2772 byte order setting accordingly. In native mode, the leading BOM
2773 mark is skipped, in all other modes, it is copied to the output
2774 stream as-is (giving a ZWNBSP character). */
2775 if (bo == 0) {
2776 if (size >= 4) {
2777 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002778 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002779#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002780 if (bom == 0x0000FEFF) {
2781 q += 4;
2782 bo = -1;
2783 }
2784 else if (bom == 0xFFFE0000) {
2785 q += 4;
2786 bo = 1;
2787 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002788#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002789 if (bom == 0x0000FEFF) {
2790 q += 4;
2791 bo = 1;
2792 }
2793 else if (bom == 0xFFFE0000) {
2794 q += 4;
2795 bo = -1;
2796 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002797#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002798 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002799 }
2800
2801 if (bo == -1) {
2802 /* force LE */
2803 iorder[0] = 0;
2804 iorder[1] = 1;
2805 iorder[2] = 2;
2806 iorder[3] = 3;
2807 }
2808 else if (bo == 1) {
2809 /* force BE */
2810 iorder[0] = 3;
2811 iorder[1] = 2;
2812 iorder[2] = 1;
2813 iorder[3] = 0;
2814 }
2815
2816 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002817 Py_UCS4 ch;
2818 /* remaining bytes at the end? (size should be divisible by 4) */
2819 if (e-q<4) {
2820 if (consumed)
2821 break;
2822 errmsg = "truncated data";
2823 startinpos = ((const char *)q)-starts;
2824 endinpos = ((const char *)e)-starts;
2825 goto utf32Error;
2826 /* The remaining input chars are ignored if the callback
2827 chooses to skip the input */
2828 }
2829 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2830 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002831
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 if (ch >= 0x110000)
2833 {
2834 errmsg = "codepoint not in range(0x110000)";
2835 startinpos = ((const char *)q)-starts;
2836 endinpos = startinpos+4;
2837 goto utf32Error;
2838 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002839#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002840 if (ch >= 0x10000)
2841 {
2842 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2843 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2844 }
2845 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002846#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002847 *p++ = ch;
2848 q += 4;
2849 continue;
2850 utf32Error:
2851 outpos = p-PyUnicode_AS_UNICODE(unicode);
2852 if (unicode_decode_call_errorhandler(
2853 errors, &errorHandler,
2854 "utf32", errmsg,
2855 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2856 &unicode, &outpos, &p))
2857 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002858 }
2859
2860 if (byteorder)
2861 *byteorder = bo;
2862
2863 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002865
2866 /* Adjust length */
2867 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2868 goto onError;
2869
2870 Py_XDECREF(errorHandler);
2871 Py_XDECREF(exc);
2872 return (PyObject *)unicode;
2873
Benjamin Peterson29060642009-01-31 22:14:21 +00002874 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002875 Py_DECREF(unicode);
2876 Py_XDECREF(errorHandler);
2877 Py_XDECREF(exc);
2878 return NULL;
2879}
2880
2881PyObject *
2882PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002883 Py_ssize_t size,
2884 const char *errors,
2885 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002886{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002887 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002888 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002889 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002890#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002891 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002892#else
2893 const int pairs = 0;
2894#endif
2895 /* Offsets from p for storing byte pairs in the right order. */
2896#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2897 int iorder[] = {0, 1, 2, 3};
2898#else
2899 int iorder[] = {3, 2, 1, 0};
2900#endif
2901
Benjamin Peterson29060642009-01-31 22:14:21 +00002902#define STORECHAR(CH) \
2903 do { \
2904 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2905 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2906 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2907 p[iorder[0]] = (CH) & 0xff; \
2908 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002909 } while(0)
2910
2911 /* In narrow builds we can output surrogate pairs as one codepoint,
2912 so we need less space. */
2913#ifndef Py_UNICODE_WIDE
2914 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2916 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2917 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002918#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002919 nsize = (size - pairs + (byteorder == 0));
2920 bytesize = nsize * 4;
2921 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002922 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002923 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002924 if (v == NULL)
2925 return NULL;
2926
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002927 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002928 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002929 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002930 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002931 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002932
2933 if (byteorder == -1) {
2934 /* force LE */
2935 iorder[0] = 0;
2936 iorder[1] = 1;
2937 iorder[2] = 2;
2938 iorder[3] = 3;
2939 }
2940 else if (byteorder == 1) {
2941 /* force BE */
2942 iorder[0] = 3;
2943 iorder[1] = 2;
2944 iorder[2] = 1;
2945 iorder[3] = 0;
2946 }
2947
2948 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002950#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2952 Py_UCS4 ch2 = *s;
2953 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2954 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2955 s++;
2956 size--;
2957 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002958 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002959#endif
2960 STORECHAR(ch);
2961 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002962
2963 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002964 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002965#undef STORECHAR
2966}
2967
2968PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2969{
2970 if (!PyUnicode_Check(unicode)) {
2971 PyErr_BadArgument();
2972 return NULL;
2973 }
2974 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 PyUnicode_GET_SIZE(unicode),
2976 NULL,
2977 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002978}
2979
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980/* --- UTF-16 Codec ------------------------------------------------------- */
2981
Tim Peters772747b2001-08-09 22:21:55 +00002982PyObject *
2983PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 Py_ssize_t size,
2985 const char *errors,
2986 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987{
Walter Dörwald69652032004-09-07 20:24:22 +00002988 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2989}
2990
Antoine Pitrouab868312009-01-10 15:40:25 +00002991/* Two masks for fast checking of whether a C 'long' may contain
2992 UTF16-encoded surrogate characters. This is an efficient heuristic,
2993 assuming that non-surrogate characters with a code point >= 0x8000 are
2994 rare in most input.
2995 FAST_CHAR_MASK is used when the input is in native byte ordering,
2996 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002997*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002998#if (SIZEOF_LONG == 8)
2999# define FAST_CHAR_MASK 0x8000800080008000L
3000# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3001#elif (SIZEOF_LONG == 4)
3002# define FAST_CHAR_MASK 0x80008000L
3003# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3004#else
3005# error C 'long' size should be either 4 or 8!
3006#endif
3007
Walter Dörwald69652032004-09-07 20:24:22 +00003008PyObject *
3009PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 Py_ssize_t size,
3011 const char *errors,
3012 int *byteorder,
3013 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003014{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003015 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003016 Py_ssize_t startinpos;
3017 Py_ssize_t endinpos;
3018 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019 PyUnicodeObject *unicode;
3020 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003021 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003022 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003023 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003024 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003025 /* Offsets from q for retrieving byte pairs in the right order. */
3026#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3027 int ihi = 1, ilo = 0;
3028#else
3029 int ihi = 0, ilo = 1;
3030#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 PyObject *errorHandler = NULL;
3032 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033
3034 /* Note: size will always be longer than the resulting Unicode
3035 character count */
3036 unicode = _PyUnicode_New(size);
3037 if (!unicode)
3038 return NULL;
3039 if (size == 0)
3040 return (PyObject *)unicode;
3041
3042 /* Unpack UTF-16 encoded data */
3043 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003044 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003045 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046
3047 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003048 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003050 /* Check for BOM marks (U+FEFF) in the input and adjust current
3051 byte order setting accordingly. In native mode, the leading BOM
3052 mark is skipped, in all other modes, it is copied to the output
3053 stream as-is (giving a ZWNBSP character). */
3054 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003055 if (size >= 2) {
3056 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003057#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 if (bom == 0xFEFF) {
3059 q += 2;
3060 bo = -1;
3061 }
3062 else if (bom == 0xFFFE) {
3063 q += 2;
3064 bo = 1;
3065 }
Tim Petersced69f82003-09-16 20:30:58 +00003066#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003067 if (bom == 0xFEFF) {
3068 q += 2;
3069 bo = 1;
3070 }
3071 else if (bom == 0xFFFE) {
3072 q += 2;
3073 bo = -1;
3074 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003075#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078
Tim Peters772747b2001-08-09 22:21:55 +00003079 if (bo == -1) {
3080 /* force LE */
3081 ihi = 1;
3082 ilo = 0;
3083 }
3084 else if (bo == 1) {
3085 /* force BE */
3086 ihi = 0;
3087 ilo = 1;
3088 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003089#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3090 native_ordering = ilo < ihi;
3091#else
3092 native_ordering = ilo > ihi;
3093#endif
Tim Peters772747b2001-08-09 22:21:55 +00003094
Antoine Pitrouab868312009-01-10 15:40:25 +00003095 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003096 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003098 /* First check for possible aligned read of a C 'long'. Unaligned
3099 reads are more expensive, better to defer to another iteration. */
3100 if (!((size_t) q & LONG_PTR_MASK)) {
3101 /* Fast path for runs of non-surrogate chars. */
3102 register const unsigned char *_q = q;
3103 Py_UNICODE *_p = p;
3104 if (native_ordering) {
3105 /* Native ordering is simple: as long as the input cannot
3106 possibly contain a surrogate char, do an unrolled copy
3107 of several 16-bit code points to the target object.
3108 The non-surrogate check is done on several input bytes
3109 at a time (as many as a C 'long' can contain). */
3110 while (_q < aligned_end) {
3111 unsigned long data = * (unsigned long *) _q;
3112 if (data & FAST_CHAR_MASK)
3113 break;
3114 _p[0] = ((unsigned short *) _q)[0];
3115 _p[1] = ((unsigned short *) _q)[1];
3116#if (SIZEOF_LONG == 8)
3117 _p[2] = ((unsigned short *) _q)[2];
3118 _p[3] = ((unsigned short *) _q)[3];
3119#endif
3120 _q += SIZEOF_LONG;
3121 _p += SIZEOF_LONG / 2;
3122 }
3123 }
3124 else {
3125 /* Byteswapped ordering is similar, but we must decompose
3126 the copy bytewise, and take care of zero'ing out the
3127 upper bytes if the target object is in 32-bit units
3128 (that is, in UCS-4 builds). */
3129 while (_q < aligned_end) {
3130 unsigned long data = * (unsigned long *) _q;
3131 if (data & SWAPPED_FAST_CHAR_MASK)
3132 break;
3133 /* Zero upper bytes in UCS-4 builds */
3134#if (Py_UNICODE_SIZE > 2)
3135 _p[0] = 0;
3136 _p[1] = 0;
3137#if (SIZEOF_LONG == 8)
3138 _p[2] = 0;
3139 _p[3] = 0;
3140#endif
3141#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003142 /* Issue #4916; UCS-4 builds on big endian machines must
3143 fill the two last bytes of each 4-byte unit. */
3144#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3145# define OFF 2
3146#else
3147# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003148#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003149 ((unsigned char *) _p)[OFF + 1] = _q[0];
3150 ((unsigned char *) _p)[OFF + 0] = _q[1];
3151 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3152 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3153#if (SIZEOF_LONG == 8)
3154 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3155 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3156 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3157 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3158#endif
3159#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003160 _q += SIZEOF_LONG;
3161 _p += SIZEOF_LONG / 2;
3162 }
3163 }
3164 p = _p;
3165 q = _q;
3166 if (q >= e)
3167 break;
3168 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170
Benjamin Peterson14339b62009-01-31 16:36:08 +00003171 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003172
3173 if (ch < 0xD800 || ch > 0xDFFF) {
3174 *p++ = ch;
3175 continue;
3176 }
3177
3178 /* UTF-16 code pair: */
3179 if (q > e) {
3180 errmsg = "unexpected end of data";
3181 startinpos = (((const char *)q) - 2) - starts;
3182 endinpos = ((const char *)e) + 1 - starts;
3183 goto utf16Error;
3184 }
3185 if (0xD800 <= ch && ch <= 0xDBFF) {
3186 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3187 q += 2;
3188 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003189#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003190 *p++ = ch;
3191 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003192#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003193 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003194#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003195 continue;
3196 }
3197 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003198 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003199 startinpos = (((const char *)q)-4)-starts;
3200 endinpos = startinpos+2;
3201 goto utf16Error;
3202 }
3203
Benjamin Peterson14339b62009-01-31 16:36:08 +00003204 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 errmsg = "illegal encoding";
3206 startinpos = (((const char *)q)-2)-starts;
3207 endinpos = startinpos+2;
3208 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003209
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 utf16Error:
3211 outpos = p - PyUnicode_AS_UNICODE(unicode);
3212 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003213 errors,
3214 &errorHandler,
3215 "utf16", errmsg,
3216 &starts,
3217 (const char **)&e,
3218 &startinpos,
3219 &endinpos,
3220 &exc,
3221 (const char **)&q,
3222 &unicode,
3223 &outpos,
3224 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003227 /* remaining byte at the end? (size should be even) */
3228 if (e == q) {
3229 if (!consumed) {
3230 errmsg = "truncated data";
3231 startinpos = ((const char *)q) - starts;
3232 endinpos = ((const char *)e) + 1 - starts;
3233 outpos = p - PyUnicode_AS_UNICODE(unicode);
3234 if (unicode_decode_call_errorhandler(
3235 errors,
3236 &errorHandler,
3237 "utf16", errmsg,
3238 &starts,
3239 (const char **)&e,
3240 &startinpos,
3241 &endinpos,
3242 &exc,
3243 (const char **)&q,
3244 &unicode,
3245 &outpos,
3246 &p))
3247 goto onError;
3248 /* The remaining input chars are ignored if the callback
3249 chooses to skip the input */
3250 }
3251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252
3253 if (byteorder)
3254 *byteorder = bo;
3255
Walter Dörwald69652032004-09-07 20:24:22 +00003256 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003258
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003260 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 goto onError;
3262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return (PyObject *)unicode;
3266
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 Py_XDECREF(errorHandler);
3270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 return NULL;
3272}
3273
Antoine Pitrouab868312009-01-10 15:40:25 +00003274#undef FAST_CHAR_MASK
3275#undef SWAPPED_FAST_CHAR_MASK
3276
Tim Peters772747b2001-08-09 22:21:55 +00003277PyObject *
3278PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003279 Py_ssize_t size,
3280 const char *errors,
3281 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003283 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003284 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003285 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003286#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003287 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003288#else
3289 const int pairs = 0;
3290#endif
Tim Peters772747b2001-08-09 22:21:55 +00003291 /* Offsets from p for storing byte pairs in the right order. */
3292#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3293 int ihi = 1, ilo = 0;
3294#else
3295 int ihi = 0, ilo = 1;
3296#endif
3297
Benjamin Peterson29060642009-01-31 22:14:21 +00003298#define STORECHAR(CH) \
3299 do { \
3300 p[ihi] = ((CH) >> 8) & 0xff; \
3301 p[ilo] = (CH) & 0xff; \
3302 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003303 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003305#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003306 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003307 if (s[i] >= 0x10000)
3308 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003309#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003310 /* 2 * (size + pairs + (byteorder == 0)) */
3311 if (size > PY_SSIZE_T_MAX ||
3312 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003314 nsize = size + pairs + (byteorder == 0);
3315 bytesize = nsize * 2;
3316 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003317 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003318 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319 if (v == NULL)
3320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003322 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003325 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003326 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003327
3328 if (byteorder == -1) {
3329 /* force LE */
3330 ihi = 1;
3331 ilo = 0;
3332 }
3333 else if (byteorder == 1) {
3334 /* force BE */
3335 ihi = 0;
3336 ilo = 1;
3337 }
3338
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003339 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003340 Py_UNICODE ch = *s++;
3341 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003342#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003343 if (ch >= 0x10000) {
3344 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3345 ch = 0xD800 | ((ch-0x10000) >> 10);
3346 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003347#endif
Tim Peters772747b2001-08-09 22:21:55 +00003348 STORECHAR(ch);
3349 if (ch2)
3350 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003351 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003352
3353 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003354 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003355#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356}
3357
3358PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3359{
3360 if (!PyUnicode_Check(unicode)) {
3361 PyErr_BadArgument();
3362 return NULL;
3363 }
3364 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003365 PyUnicode_GET_SIZE(unicode),
3366 NULL,
3367 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368}
3369
3370/* --- Unicode Escape Codec ----------------------------------------------- */
3371
Fredrik Lundh06d12682001-01-24 07:59:11 +00003372static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003373
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003375 Py_ssize_t size,
3376 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003379 Py_ssize_t startinpos;
3380 Py_ssize_t endinpos;
3381 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003386 char* message;
3387 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388 PyObject *errorHandler = NULL;
3389 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003390
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 /* Escaped strings will always be longer than the resulting
3392 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393 length after conversion to the true value.
3394 (but if the error callback returns a long replacement string
3395 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396 v = _PyUnicode_New(size);
3397 if (v == NULL)
3398 goto onError;
3399 if (size == 0)
3400 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003401
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003404
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405 while (s < end) {
3406 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003407 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409
3410 /* Non-escape characters are interpreted as Unicode ordinals */
3411 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003412 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003413 continue;
3414 }
3415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417 /* \ - Escapes */
3418 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003419 c = *s++;
3420 if (s > end)
3421 c = '\0'; /* Invalid after \ */
3422 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 case '\n': break;
3426 case '\\': *p++ = '\\'; break;
3427 case '\'': *p++ = '\''; break;
3428 case '\"': *p++ = '\"'; break;
3429 case 'b': *p++ = '\b'; break;
3430 case 'f': *p++ = '\014'; break; /* FF */
3431 case 't': *p++ = '\t'; break;
3432 case 'n': *p++ = '\n'; break;
3433 case 'r': *p++ = '\r'; break;
3434 case 'v': *p++ = '\013'; break; /* VT */
3435 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3436
Benjamin Peterson29060642009-01-31 22:14:21 +00003437 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 case '0': case '1': case '2': case '3':
3439 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003440 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003441 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003442 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003443 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003444 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003446 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 break;
3448
Benjamin Peterson29060642009-01-31 22:14:21 +00003449 /* hex escapes */
3450 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003452 digits = 2;
3453 message = "truncated \\xXX escape";
3454 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455
Benjamin Peterson29060642009-01-31 22:14:21 +00003456 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003458 digits = 4;
3459 message = "truncated \\uXXXX escape";
3460 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461
Benjamin Peterson29060642009-01-31 22:14:21 +00003462 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003463 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003464 digits = 8;
3465 message = "truncated \\UXXXXXXXX escape";
3466 hexescape:
3467 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468 outpos = p-PyUnicode_AS_UNICODE(v);
3469 if (s+digits>end) {
3470 endinpos = size;
3471 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003472 errors, &errorHandler,
3473 "unicodeescape", "end of string in escape sequence",
3474 &starts, &end, &startinpos, &endinpos, &exc, &s,
3475 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 goto onError;
3477 goto nextByte;
3478 }
3479 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003480 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003481 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 endinpos = (s+i+1)-starts;
3483 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003484 errors, &errorHandler,
3485 "unicodeescape", message,
3486 &starts, &end, &startinpos, &endinpos, &exc, &s,
3487 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003488 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003490 }
3491 chr = (chr<<4) & ~0xF;
3492 if (c >= '0' && c <= '9')
3493 chr += c - '0';
3494 else if (c >= 'a' && c <= 'f')
3495 chr += 10 + c - 'a';
3496 else
3497 chr += 10 + c - 'A';
3498 }
3499 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003500 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 /* _decoding_error will have already written into the
3502 target buffer. */
3503 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003504 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003505 /* when we get here, chr is a 32-bit unicode character */
3506 if (chr <= 0xffff)
3507 /* UCS-2 character */
3508 *p++ = (Py_UNICODE) chr;
3509 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003510 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003511 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003512#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003513 *p++ = chr;
3514#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003515 chr -= 0x10000L;
3516 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003517 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003518#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003519 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 endinpos = s-starts;
3521 outpos = p-PyUnicode_AS_UNICODE(v);
3522 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 errors, &errorHandler,
3524 "unicodeescape", "illegal Unicode character",
3525 &starts, &end, &startinpos, &endinpos, &exc, &s,
3526 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003527 goto onError;
3528 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003529 break;
3530
Benjamin Peterson29060642009-01-31 22:14:21 +00003531 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003532 case 'N':
3533 message = "malformed \\N character escape";
3534 if (ucnhash_CAPI == NULL) {
3535 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003536 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003537 if (ucnhash_CAPI == NULL)
3538 goto ucnhashError;
3539 }
3540 if (*s == '{') {
3541 const char *start = s+1;
3542 /* look for the closing brace */
3543 while (*s != '}' && s < end)
3544 s++;
3545 if (s > start && s < end && *s == '}') {
3546 /* found a name. look it up in the unicode database */
3547 message = "unknown Unicode character name";
3548 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003549 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003550 goto store;
3551 }
3552 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 endinpos = s-starts;
3554 outpos = p-PyUnicode_AS_UNICODE(v);
3555 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003556 errors, &errorHandler,
3557 "unicodeescape", message,
3558 &starts, &end, &startinpos, &endinpos, &exc, &s,
3559 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003560 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003561 break;
3562
3563 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003564 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 message = "\\ at end of string";
3566 s--;
3567 endinpos = s-starts;
3568 outpos = p-PyUnicode_AS_UNICODE(v);
3569 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003570 errors, &errorHandler,
3571 "unicodeescape", message,
3572 &starts, &end, &startinpos, &endinpos, &exc, &s,
3573 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003574 goto onError;
3575 }
3576 else {
3577 *p++ = '\\';
3578 *p++ = (unsigned char)s[-1];
3579 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003580 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003582 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003585 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003587 Py_XDECREF(errorHandler);
3588 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003590
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003592 PyErr_SetString(
3593 PyExc_UnicodeError,
3594 "\\N escapes not supported (can't load unicodedata module)"
3595 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003596 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 Py_XDECREF(errorHandler);
3598 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003599 return NULL;
3600
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 Py_XDECREF(errorHandler);
3604 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605 return NULL;
3606}
3607
3608/* Return a Unicode-Escape string version of the Unicode object.
3609
3610 If quotes is true, the string is enclosed in u"" or u'' quotes as
3611 appropriate.
3612
3613*/
3614
Thomas Wouters477c8d52006-05-27 19:21:47 +00003615Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 Py_ssize_t size,
3617 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003618{
3619 /* like wcschr, but doesn't stop at NULL characters */
3620
3621 while (size-- > 0) {
3622 if (*s == ch)
3623 return s;
3624 s++;
3625 }
3626
3627 return NULL;
3628}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003629
Walter Dörwald79e913e2007-05-12 11:08:06 +00003630static const char *hexdigits = "0123456789abcdef";
3631
3632PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003635 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003638#ifdef Py_UNICODE_WIDE
3639 const Py_ssize_t expandsize = 10;
3640#else
3641 const Py_ssize_t expandsize = 6;
3642#endif
3643
Thomas Wouters89f507f2006-12-13 04:49:30 +00003644 /* XXX(nnorwitz): rather than over-allocating, it would be
3645 better to choose a different scheme. Perhaps scan the
3646 first N-chars of the string and allocate based on that size.
3647 */
3648 /* Initial allocation is based on the longest-possible unichr
3649 escape.
3650
3651 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3652 unichr, so in this case it's the longest unichr escape. In
3653 narrow (UTF-16) builds this is five chars per source unichr
3654 since there are two unichrs in the surrogate pair, so in narrow
3655 (UTF-16) builds it's not the longest unichr escape.
3656
3657 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3658 so in the narrow (UTF-16) build case it's the longest unichr
3659 escape.
3660 */
3661
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003662 if (size == 0)
3663 return PyBytes_FromStringAndSize(NULL, 0);
3664
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003665 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003667
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003668 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 2
3670 + expandsize*size
3671 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672 if (repr == NULL)
3673 return NULL;
3674
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003675 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 while (size-- > 0) {
3678 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003679
Walter Dörwald79e913e2007-05-12 11:08:06 +00003680 /* Escape backslashes */
3681 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 *p++ = '\\';
3683 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003684 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003685 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003686
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003687#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003688 /* Map 21-bit characters to '\U00xxxxxx' */
3689 else if (ch >= 0x10000) {
3690 *p++ = '\\';
3691 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003692 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3693 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3694 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3695 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3696 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3697 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3698 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3699 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003700 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003701 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003702#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3704 else if (ch >= 0xD800 && ch < 0xDC00) {
3705 Py_UNICODE ch2;
3706 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003707
Benjamin Peterson29060642009-01-31 22:14:21 +00003708 ch2 = *s++;
3709 size--;
3710 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3711 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3712 *p++ = '\\';
3713 *p++ = 'U';
3714 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3715 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3716 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3717 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3718 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3719 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3720 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3721 *p++ = hexdigits[ucs & 0x0000000F];
3722 continue;
3723 }
3724 /* Fall through: isolated surrogates are copied as-is */
3725 s--;
3726 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003727 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003728#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003729
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003731 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 *p++ = '\\';
3733 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003734 *p++ = hexdigits[(ch >> 12) & 0x000F];
3735 *p++ = hexdigits[(ch >> 8) & 0x000F];
3736 *p++ = hexdigits[(ch >> 4) & 0x000F];
3737 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003739
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003740 /* Map special whitespace to '\t', \n', '\r' */
3741 else if (ch == '\t') {
3742 *p++ = '\\';
3743 *p++ = 't';
3744 }
3745 else if (ch == '\n') {
3746 *p++ = '\\';
3747 *p++ = 'n';
3748 }
3749 else if (ch == '\r') {
3750 *p++ = '\\';
3751 *p++ = 'r';
3752 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003753
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003754 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003755 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003757 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003758 *p++ = hexdigits[(ch >> 4) & 0x000F];
3759 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003760 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003761
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 /* Copy everything else as-is */
3763 else
3764 *p++ = (char) ch;
3765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003767 assert(p - PyBytes_AS_STRING(repr) > 0);
3768 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3769 return NULL;
3770 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771}
3772
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003773PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003775 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 if (!PyUnicode_Check(unicode)) {
3777 PyErr_BadArgument();
3778 return NULL;
3779 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003780 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3781 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003782 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783}
3784
3785/* --- Raw Unicode Escape Codec ------------------------------------------- */
3786
3787PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003788 Py_ssize_t size,
3789 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003792 Py_ssize_t startinpos;
3793 Py_ssize_t endinpos;
3794 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 const char *end;
3798 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003799 PyObject *errorHandler = NULL;
3800 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003801
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 /* Escaped strings will always be longer than the resulting
3803 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 length after conversion to the true value. (But decoding error
3805 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 v = _PyUnicode_New(size);
3807 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003808 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003810 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 end = s + size;
3813 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003814 unsigned char c;
3815 Py_UCS4 x;
3816 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003817 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818
Benjamin Peterson29060642009-01-31 22:14:21 +00003819 /* Non-escape characters are interpreted as Unicode ordinals */
3820 if (*s != '\\') {
3821 *p++ = (unsigned char)*s++;
3822 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003823 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003824 startinpos = s-starts;
3825
3826 /* \u-escapes are only interpreted iff the number of leading
3827 backslashes if odd */
3828 bs = s;
3829 for (;s < end;) {
3830 if (*s != '\\')
3831 break;
3832 *p++ = (unsigned char)*s++;
3833 }
3834 if (((s - bs) & 1) == 0 ||
3835 s >= end ||
3836 (*s != 'u' && *s != 'U')) {
3837 continue;
3838 }
3839 p--;
3840 count = *s=='u' ? 4 : 8;
3841 s++;
3842
3843 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3844 outpos = p-PyUnicode_AS_UNICODE(v);
3845 for (x = 0, i = 0; i < count; ++i, ++s) {
3846 c = (unsigned char)*s;
3847 if (!ISXDIGIT(c)) {
3848 endinpos = s-starts;
3849 if (unicode_decode_call_errorhandler(
3850 errors, &errorHandler,
3851 "rawunicodeescape", "truncated \\uXXXX",
3852 &starts, &end, &startinpos, &endinpos, &exc, &s,
3853 &v, &outpos, &p))
3854 goto onError;
3855 goto nextByte;
3856 }
3857 x = (x<<4) & ~0xF;
3858 if (c >= '0' && c <= '9')
3859 x += c - '0';
3860 else if (c >= 'a' && c <= 'f')
3861 x += 10 + c - 'a';
3862 else
3863 x += 10 + c - 'A';
3864 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003865 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003866 /* UCS-2 character */
3867 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003868 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003869 /* UCS-4 character. Either store directly, or as
3870 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003871#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003872 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003873#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003874 x -= 0x10000L;
3875 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3876 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003877#endif
3878 } else {
3879 endinpos = s-starts;
3880 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003881 if (unicode_decode_call_errorhandler(
3882 errors, &errorHandler,
3883 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003884 &starts, &end, &startinpos, &endinpos, &exc, &s,
3885 &v, &outpos, &p))
3886 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003887 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003888 nextByte:
3889 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003891 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003892 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 Py_XDECREF(errorHandler);
3894 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003896
Benjamin Peterson29060642009-01-31 22:14:21 +00003897 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 Py_XDECREF(errorHandler);
3900 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 return NULL;
3902}
3903
3904PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003905 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003907 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 char *p;
3909 char *q;
3910
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003911#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003912 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003913#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003914 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003915#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003916
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003917 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003918 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003919
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003920 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 if (repr == NULL)
3922 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003923 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003924 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003926 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 while (size-- > 0) {
3928 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003929#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 /* Map 32-bit characters to '\Uxxxxxxxx' */
3931 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003932 *p++ = '\\';
3933 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003934 *p++ = hexdigits[(ch >> 28) & 0xf];
3935 *p++ = hexdigits[(ch >> 24) & 0xf];
3936 *p++ = hexdigits[(ch >> 20) & 0xf];
3937 *p++ = hexdigits[(ch >> 16) & 0xf];
3938 *p++ = hexdigits[(ch >> 12) & 0xf];
3939 *p++ = hexdigits[(ch >> 8) & 0xf];
3940 *p++ = hexdigits[(ch >> 4) & 0xf];
3941 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003942 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003943 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003944#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003945 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3946 if (ch >= 0xD800 && ch < 0xDC00) {
3947 Py_UNICODE ch2;
3948 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003949
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 ch2 = *s++;
3951 size--;
3952 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3953 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3954 *p++ = '\\';
3955 *p++ = 'U';
3956 *p++ = hexdigits[(ucs >> 28) & 0xf];
3957 *p++ = hexdigits[(ucs >> 24) & 0xf];
3958 *p++ = hexdigits[(ucs >> 20) & 0xf];
3959 *p++ = hexdigits[(ucs >> 16) & 0xf];
3960 *p++ = hexdigits[(ucs >> 12) & 0xf];
3961 *p++ = hexdigits[(ucs >> 8) & 0xf];
3962 *p++ = hexdigits[(ucs >> 4) & 0xf];
3963 *p++ = hexdigits[ucs & 0xf];
3964 continue;
3965 }
3966 /* Fall through: isolated surrogates are copied as-is */
3967 s--;
3968 size++;
3969 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003970#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 /* Map 16-bit characters to '\uxxxx' */
3972 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 *p++ = '\\';
3974 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003975 *p++ = hexdigits[(ch >> 12) & 0xf];
3976 *p++ = hexdigits[(ch >> 8) & 0xf];
3977 *p++ = hexdigits[(ch >> 4) & 0xf];
3978 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 /* Copy everything else as-is */
3981 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 *p++ = (char) ch;
3983 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003984 size = p - q;
3985
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003986 assert(size > 0);
3987 if (_PyBytes_Resize(&repr, size) < 0)
3988 return NULL;
3989 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990}
3991
3992PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3993{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003994 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003996 PyErr_BadArgument();
3997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003999 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4000 PyUnicode_GET_SIZE(unicode));
4001
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004002 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003}
4004
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004005/* --- Unicode Internal Codec ------------------------------------------- */
4006
4007PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 Py_ssize_t size,
4009 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004010{
4011 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004012 Py_ssize_t startinpos;
4013 Py_ssize_t endinpos;
4014 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004015 PyUnicodeObject *v;
4016 Py_UNICODE *p;
4017 const char *end;
4018 const char *reason;
4019 PyObject *errorHandler = NULL;
4020 PyObject *exc = NULL;
4021
Neal Norwitzd43069c2006-01-08 01:12:10 +00004022#ifdef Py_UNICODE_WIDE
4023 Py_UNICODE unimax = PyUnicode_GetMax();
4024#endif
4025
Thomas Wouters89f507f2006-12-13 04:49:30 +00004026 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004027 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4028 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004029 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004030 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004032 p = PyUnicode_AS_UNICODE(v);
4033 end = s + size;
4034
4035 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004036 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004037 /* We have to sanity check the raw data, otherwise doom looms for
4038 some malformed UCS-4 data. */
4039 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004040#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004041 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004042#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004043 end-s < Py_UNICODE_SIZE
4044 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004045 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004046 startinpos = s - starts;
4047 if (end-s < Py_UNICODE_SIZE) {
4048 endinpos = end-starts;
4049 reason = "truncated input";
4050 }
4051 else {
4052 endinpos = s - starts + Py_UNICODE_SIZE;
4053 reason = "illegal code point (> 0x10FFFF)";
4054 }
4055 outpos = p - PyUnicode_AS_UNICODE(v);
4056 if (unicode_decode_call_errorhandler(
4057 errors, &errorHandler,
4058 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004059 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004060 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004061 goto onError;
4062 }
4063 }
4064 else {
4065 p++;
4066 s += Py_UNICODE_SIZE;
4067 }
4068 }
4069
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004070 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004071 goto onError;
4072 Py_XDECREF(errorHandler);
4073 Py_XDECREF(exc);
4074 return (PyObject *)v;
4075
Benjamin Peterson29060642009-01-31 22:14:21 +00004076 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004077 Py_XDECREF(v);
4078 Py_XDECREF(errorHandler);
4079 Py_XDECREF(exc);
4080 return NULL;
4081}
4082
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083/* --- Latin-1 Codec ------------------------------------------------------ */
4084
4085PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004086 Py_ssize_t size,
4087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088{
4089 PyUnicodeObject *v;
4090 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004091 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004092
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004094 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004095 Py_UNICODE r = *(unsigned char*)s;
4096 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004097 }
4098
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 v = _PyUnicode_New(size);
4100 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004105 e = s + size;
4106 /* Unrolling the copy makes it much faster by reducing the looping
4107 overhead. This is similar to what many memcpy() implementations do. */
4108 unrolled_end = e - 4;
4109 while (s < unrolled_end) {
4110 p[0] = (unsigned char) s[0];
4111 p[1] = (unsigned char) s[1];
4112 p[2] = (unsigned char) s[2];
4113 p[3] = (unsigned char) s[3];
4114 s += 4;
4115 p += 4;
4116 }
4117 while (s < e)
4118 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004120
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 Py_XDECREF(v);
4123 return NULL;
4124}
4125
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126/* create or adjust a UnicodeEncodeError */
4127static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004128 const char *encoding,
4129 const Py_UNICODE *unicode, Py_ssize_t size,
4130 Py_ssize_t startpos, Py_ssize_t endpos,
4131 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004133 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 *exceptionObject = PyUnicodeEncodeError_Create(
4135 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 }
4137 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4139 goto onError;
4140 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4141 goto onError;
4142 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4143 goto onError;
4144 return;
4145 onError:
4146 Py_DECREF(*exceptionObject);
4147 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 }
4149}
4150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151/* raises a UnicodeEncodeError */
4152static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 const char *encoding,
4154 const Py_UNICODE *unicode, Py_ssize_t size,
4155 Py_ssize_t startpos, Py_ssize_t endpos,
4156 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157{
4158 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004159 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162}
4163
4164/* error handling callback helper:
4165 build arguments, call the callback and check the arguments,
4166 put the result into newpos and return the replacement string, which
4167 has to be freed by the caller */
4168static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 PyObject **errorHandler,
4170 const char *encoding, const char *reason,
4171 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4172 Py_ssize_t startpos, Py_ssize_t endpos,
4173 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004175 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176
4177 PyObject *restuple;
4178 PyObject *resunicode;
4179
4180 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184 }
4185
4186 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004187 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190
4191 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004196 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 Py_DECREF(restuple);
4198 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004200 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 &resunicode, newpos)) {
4202 Py_DECREF(restuple);
4203 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004205 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4206 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4207 Py_DECREF(restuple);
4208 return NULL;
4209 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004211 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004212 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4214 Py_DECREF(restuple);
4215 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004216 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217 Py_INCREF(resunicode);
4218 Py_DECREF(restuple);
4219 return resunicode;
4220}
4221
4222static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004223 Py_ssize_t size,
4224 const char *errors,
4225 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226{
4227 /* output object */
4228 PyObject *res;
4229 /* pointers to the beginning and end+1 of input */
4230 const Py_UNICODE *startp = p;
4231 const Py_UNICODE *endp = p + size;
4232 /* pointer to the beginning of the unencodable characters */
4233 /* const Py_UNICODE *badp = NULL; */
4234 /* pointer into the output */
4235 char *str;
4236 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004237 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004238 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4239 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 PyObject *errorHandler = NULL;
4241 PyObject *exc = NULL;
4242 /* the following variable is used for caching string comparisons
4243 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4244 int known_errorHandler = -1;
4245
4246 /* allocate enough for a simple encoding without
4247 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004248 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004249 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004250 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004252 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004253 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 ressize = size;
4255
4256 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 /* can we encode this? */
4260 if (c<limit) {
4261 /* no overflow check, because we know that the space is enough */
4262 *str++ = (char)c;
4263 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004264 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 else {
4266 Py_ssize_t unicodepos = p-startp;
4267 Py_ssize_t requiredsize;
4268 PyObject *repunicode;
4269 Py_ssize_t repsize;
4270 Py_ssize_t newpos;
4271 Py_ssize_t respos;
4272 Py_UNICODE *uni2;
4273 /* startpos for collecting unencodable chars */
4274 const Py_UNICODE *collstart = p;
4275 const Py_UNICODE *collend = p;
4276 /* find all unecodable characters */
4277 while ((collend < endp) && ((*collend)>=limit))
4278 ++collend;
4279 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4280 if (known_errorHandler==-1) {
4281 if ((errors==NULL) || (!strcmp(errors, "strict")))
4282 known_errorHandler = 1;
4283 else if (!strcmp(errors, "replace"))
4284 known_errorHandler = 2;
4285 else if (!strcmp(errors, "ignore"))
4286 known_errorHandler = 3;
4287 else if (!strcmp(errors, "xmlcharrefreplace"))
4288 known_errorHandler = 4;
4289 else
4290 known_errorHandler = 0;
4291 }
4292 switch (known_errorHandler) {
4293 case 1: /* strict */
4294 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4295 goto onError;
4296 case 2: /* replace */
4297 while (collstart++<collend)
4298 *str++ = '?'; /* fall through */
4299 case 3: /* ignore */
4300 p = collend;
4301 break;
4302 case 4: /* xmlcharrefreplace */
4303 respos = str - PyBytes_AS_STRING(res);
4304 /* determine replacement size (temporarily (mis)uses p) */
4305 for (p = collstart, repsize = 0; p < collend; ++p) {
4306 if (*p<10)
4307 repsize += 2+1+1;
4308 else if (*p<100)
4309 repsize += 2+2+1;
4310 else if (*p<1000)
4311 repsize += 2+3+1;
4312 else if (*p<10000)
4313 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004314#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 else
4316 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004317#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 else if (*p<100000)
4319 repsize += 2+5+1;
4320 else if (*p<1000000)
4321 repsize += 2+6+1;
4322 else
4323 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004324#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004325 }
4326 requiredsize = respos+repsize+(endp-collend);
4327 if (requiredsize > ressize) {
4328 if (requiredsize<2*ressize)
4329 requiredsize = 2*ressize;
4330 if (_PyBytes_Resize(&res, requiredsize))
4331 goto onError;
4332 str = PyBytes_AS_STRING(res) + respos;
4333 ressize = requiredsize;
4334 }
4335 /* generate replacement (temporarily (mis)uses p) */
4336 for (p = collstart; p < collend; ++p) {
4337 str += sprintf(str, "&#%d;", (int)*p);
4338 }
4339 p = collend;
4340 break;
4341 default:
4342 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4343 encoding, reason, startp, size, &exc,
4344 collstart-startp, collend-startp, &newpos);
4345 if (repunicode == NULL)
4346 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004347 if (PyBytes_Check(repunicode)) {
4348 /* Directly copy bytes result to output. */
4349 repsize = PyBytes_Size(repunicode);
4350 if (repsize > 1) {
4351 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004352 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004353 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4354 Py_DECREF(repunicode);
4355 goto onError;
4356 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004357 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004358 ressize += repsize-1;
4359 }
4360 memcpy(str, PyBytes_AsString(repunicode), repsize);
4361 str += repsize;
4362 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004363 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004364 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004365 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004366 /* need more space? (at least enough for what we
4367 have+the replacement+the rest of the string, so
4368 we won't have to check space for encodable characters) */
4369 respos = str - PyBytes_AS_STRING(res);
4370 repsize = PyUnicode_GET_SIZE(repunicode);
4371 requiredsize = respos+repsize+(endp-collend);
4372 if (requiredsize > ressize) {
4373 if (requiredsize<2*ressize)
4374 requiredsize = 2*ressize;
4375 if (_PyBytes_Resize(&res, requiredsize)) {
4376 Py_DECREF(repunicode);
4377 goto onError;
4378 }
4379 str = PyBytes_AS_STRING(res) + respos;
4380 ressize = requiredsize;
4381 }
4382 /* check if there is anything unencodable in the replacement
4383 and copy it to the output */
4384 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4385 c = *uni2;
4386 if (c >= limit) {
4387 raise_encode_exception(&exc, encoding, startp, size,
4388 unicodepos, unicodepos+1, reason);
4389 Py_DECREF(repunicode);
4390 goto onError;
4391 }
4392 *str = (char)c;
4393 }
4394 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004395 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004396 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004397 }
4398 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004399 /* Resize if we allocated to much */
4400 size = str - PyBytes_AS_STRING(res);
4401 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004402 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004403 if (_PyBytes_Resize(&res, size) < 0)
4404 goto onError;
4405 }
4406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 Py_XDECREF(errorHandler);
4408 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004409 return res;
4410
4411 onError:
4412 Py_XDECREF(res);
4413 Py_XDECREF(errorHandler);
4414 Py_XDECREF(exc);
4415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416}
4417
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 Py_ssize_t size,
4420 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423}
4424
4425PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4426{
4427 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 PyErr_BadArgument();
4429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430 }
4431 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004432 PyUnicode_GET_SIZE(unicode),
4433 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434}
4435
4436/* --- 7-bit ASCII Codec -------------------------------------------------- */
4437
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 Py_ssize_t size,
4440 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 PyUnicodeObject *v;
4444 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004445 Py_ssize_t startinpos;
4446 Py_ssize_t endinpos;
4447 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 const char *e;
4449 PyObject *errorHandler = NULL;
4450 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004451
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004453 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 Py_UNICODE r = *(unsigned char*)s;
4455 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004456 }
Tim Petersced69f82003-09-16 20:30:58 +00004457
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 v = _PyUnicode_New(size);
4459 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 e = s + size;
4465 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 register unsigned char c = (unsigned char)*s;
4467 if (c < 128) {
4468 *p++ = c;
4469 ++s;
4470 }
4471 else {
4472 startinpos = s-starts;
4473 endinpos = startinpos + 1;
4474 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4475 if (unicode_decode_call_errorhandler(
4476 errors, &errorHandler,
4477 "ascii", "ordinal not in range(128)",
4478 &starts, &e, &startinpos, &endinpos, &exc, &s,
4479 &v, &outpos, &p))
4480 goto onError;
4481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004483 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004484 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4485 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 Py_XDECREF(errorHandler);
4487 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004489
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 Py_XDECREF(errorHandler);
4493 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 return NULL;
4495}
4496
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 Py_ssize_t size,
4499 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502}
4503
4504PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4505{
4506 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 PyErr_BadArgument();
4508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 }
4510 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 PyUnicode_GET_SIZE(unicode),
4512 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513}
4514
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004515#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004516
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004517/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004518
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004519#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004520#define NEED_RETRY
4521#endif
4522
4523/* XXX This code is limited to "true" double-byte encodings, as
4524 a) it assumes an incomplete character consists of a single byte, and
4525 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004527
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.  Returns 1 if so, 0 otherwise.

   A byte that IsDBCSLeadByte() rejects is never a lead byte.  Otherwise
   the position CharPrev() reports for the preceding character is
   examined (NOTE(review): CharPrev semantics come from the Win32 API,
   not from this file): the byte counts as a lead byte when there is no
   preceding character, when the preceding character does not start with
   a lead byte, or when the preceding character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4538
4539/*
4540 * Decode MBCS string into unicode object. If 'final' is set, converts
4541 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4542 */
4543static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 const char *s, /* MBCS string */
4545 int size, /* sizeof MBCS string */
4546 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004547{
4548 Py_UNICODE *p;
4549 Py_ssize_t n = 0;
4550 int usize = 0;
4551
4552 assert(size >= 0);
4553
4554 /* Skip trailing lead-byte unless 'final' is set */
4555 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004557
4558 /* First get the size of the result */
4559 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4561 if (usize == 0) {
4562 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4563 return -1;
4564 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004565 }
4566
4567 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004568 /* Create unicode object */
4569 *v = _PyUnicode_New(usize);
4570 if (*v == NULL)
4571 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004572 }
4573 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 /* Extend unicode object */
4575 n = PyUnicode_GET_SIZE(*v);
4576 if (_PyUnicode_Resize(v, n + usize) < 0)
4577 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004578 }
4579
4580 /* Do the conversion */
4581 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 p = PyUnicode_AS_UNICODE(*v) + n;
4583 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4584 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4585 return -1;
4586 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004587 }
4588
4589 return size;
4590}
4591
4592PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 Py_ssize_t size,
4594 const char *errors,
4595 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004596{
4597 PyUnicodeObject *v = NULL;
4598 int done;
4599
4600 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004602
4603#ifdef NEED_RETRY
4604 retry:
4605 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004607 else
4608#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004609 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004610
4611 if (done < 0) {
4612 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004614 }
4615
4616 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004618
4619#ifdef NEED_RETRY
4620 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 s += done;
4622 size -= done;
4623 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004624 }
4625#endif
4626
4627 return (PyObject *)v;
4628}
4629
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004630PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004631 Py_ssize_t size,
4632 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004633{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004634 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4635}
4636
4637/*
4638 * Convert unicode into string object (MBCS).
4639 * Returns 0 if succeed, -1 otherwise.
4640 */
4641static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 const Py_UNICODE *p, /* unicode */
4643 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004644{
4645 int mbcssize = 0;
4646 Py_ssize_t n = 0;
4647
4648 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004649
4650 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004651 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004652 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4653 if (mbcssize == 0) {
4654 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4655 return -1;
4656 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004657 }
4658
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004659 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 /* Create string object */
4661 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4662 if (*repr == NULL)
4663 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004664 }
4665 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 /* Extend string object */
4667 n = PyBytes_Size(*repr);
4668 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4669 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004670 }
4671
4672 /* Do the conversion */
4673 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 char *s = PyBytes_AS_STRING(*repr) + n;
4675 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4676 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4677 return -1;
4678 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004679 }
4680
4681 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004682}
4683
4684PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 Py_ssize_t size,
4686 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004687{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004688 PyObject *repr = NULL;
4689 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004690
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004691#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004693 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004695 else
4696#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004698
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004699 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 Py_XDECREF(repr);
4701 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004702 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004703
4704#ifdef NEED_RETRY
4705 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 p += INT_MAX;
4707 size -= INT_MAX;
4708 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004709 }
4710#endif
4711
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004712 return repr;
4713}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004714
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004715PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4716{
4717 if (!PyUnicode_Check(unicode)) {
4718 PyErr_BadArgument();
4719 return NULL;
4720 }
4721 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 PyUnicode_GET_SIZE(unicode),
4723 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004724}
4725
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004726#undef NEED_RETRY
4727
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004728#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004729
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730/* --- Character Mapping Codec -------------------------------------------- */
4731
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004733 Py_ssize_t size,
4734 PyObject *mapping,
4735 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004738 Py_ssize_t startinpos;
4739 Py_ssize_t endinpos;
4740 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 PyUnicodeObject *v;
4743 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004744 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 PyObject *errorHandler = NULL;
4746 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004747 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004748 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004749
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 /* Default to Latin-1 */
4751 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
4754 v = _PyUnicode_New(size);
4755 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004761 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 mapstring = PyUnicode_AS_UNICODE(mapping);
4763 maplen = PyUnicode_GET_SIZE(mapping);
4764 while (s < e) {
4765 unsigned char ch = *s;
4766 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767
Benjamin Peterson29060642009-01-31 22:14:21 +00004768 if (ch < maplen)
4769 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 if (x == 0xfffe) {
4772 /* undefined mapping */
4773 outpos = p-PyUnicode_AS_UNICODE(v);
4774 startinpos = s-starts;
4775 endinpos = startinpos+1;
4776 if (unicode_decode_call_errorhandler(
4777 errors, &errorHandler,
4778 "charmap", "character maps to <undefined>",
4779 &starts, &e, &startinpos, &endinpos, &exc, &s,
4780 &v, &outpos, &p)) {
4781 goto onError;
4782 }
4783 continue;
4784 }
4785 *p++ = x;
4786 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004787 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004788 }
4789 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 while (s < e) {
4791 unsigned char ch = *s;
4792 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004793
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4795 w = PyLong_FromLong((long)ch);
4796 if (w == NULL)
4797 goto onError;
4798 x = PyObject_GetItem(mapping, w);
4799 Py_DECREF(w);
4800 if (x == NULL) {
4801 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4802 /* No mapping found means: mapping is undefined. */
4803 PyErr_Clear();
4804 x = Py_None;
4805 Py_INCREF(x);
4806 } else
4807 goto onError;
4808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004809
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 /* Apply mapping */
4811 if (PyLong_Check(x)) {
4812 long value = PyLong_AS_LONG(x);
4813 if (value < 0 || value > 65535) {
4814 PyErr_SetString(PyExc_TypeError,
4815 "character mapping must be in range(65536)");
4816 Py_DECREF(x);
4817 goto onError;
4818 }
4819 *p++ = (Py_UNICODE)value;
4820 }
4821 else if (x == Py_None) {
4822 /* undefined mapping */
4823 outpos = p-PyUnicode_AS_UNICODE(v);
4824 startinpos = s-starts;
4825 endinpos = startinpos+1;
4826 if (unicode_decode_call_errorhandler(
4827 errors, &errorHandler,
4828 "charmap", "character maps to <undefined>",
4829 &starts, &e, &startinpos, &endinpos, &exc, &s,
4830 &v, &outpos, &p)) {
4831 Py_DECREF(x);
4832 goto onError;
4833 }
4834 Py_DECREF(x);
4835 continue;
4836 }
4837 else if (PyUnicode_Check(x)) {
4838 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004839
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 if (targetsize == 1)
4841 /* 1-1 mapping */
4842 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004843
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 else if (targetsize > 1) {
4845 /* 1-n mapping */
4846 if (targetsize > extrachars) {
4847 /* resize first */
4848 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4849 Py_ssize_t needed = (targetsize - extrachars) + \
4850 (targetsize << 2);
4851 extrachars += needed;
4852 /* XXX overflow detection missing */
4853 if (_PyUnicode_Resize(&v,
4854 PyUnicode_GET_SIZE(v) + needed) < 0) {
4855 Py_DECREF(x);
4856 goto onError;
4857 }
4858 p = PyUnicode_AS_UNICODE(v) + oldpos;
4859 }
4860 Py_UNICODE_COPY(p,
4861 PyUnicode_AS_UNICODE(x),
4862 targetsize);
4863 p += targetsize;
4864 extrachars -= targetsize;
4865 }
4866 /* 1-0 mapping: skip the character */
4867 }
4868 else {
4869 /* wrong return value */
4870 PyErr_SetString(PyExc_TypeError,
4871 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004872 Py_DECREF(x);
4873 goto onError;
4874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 Py_DECREF(x);
4876 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004877 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 }
4879 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4881 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882 Py_XDECREF(errorHandler);
4883 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004885
Benjamin Peterson29060642009-01-31 22:14:21 +00004886 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 Py_XDECREF(errorHandler);
4888 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 Py_XDECREF(v);
4890 return NULL;
4891}
4892
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004893/* Charmap encoding: the lookup table */
4894
4895struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 PyObject_HEAD
4897 unsigned char level1[32];
4898 int count2, count3;
4899 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004900};
4901
4902static PyObject*
4903encoding_map_size(PyObject *obj, PyObject* args)
4904{
4905 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004906 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004908}
4909
4910static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004911 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 PyDoc_STR("Return the size (in bytes) of this object") },
4913 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004914};
4915
4916static void
4917encoding_map_dealloc(PyObject* o)
4918{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004919 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004920}
4921
4922static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004923 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004924 "EncodingMap", /*tp_name*/
4925 sizeof(struct encoding_map), /*tp_basicsize*/
4926 0, /*tp_itemsize*/
4927 /* methods */
4928 encoding_map_dealloc, /*tp_dealloc*/
4929 0, /*tp_print*/
4930 0, /*tp_getattr*/
4931 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004932 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 0, /*tp_repr*/
4934 0, /*tp_as_number*/
4935 0, /*tp_as_sequence*/
4936 0, /*tp_as_mapping*/
4937 0, /*tp_hash*/
4938 0, /*tp_call*/
4939 0, /*tp_str*/
4940 0, /*tp_getattro*/
4941 0, /*tp_setattro*/
4942 0, /*tp_as_buffer*/
4943 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4944 0, /*tp_doc*/
4945 0, /*tp_traverse*/
4946 0, /*tp_clear*/
4947 0, /*tp_richcompare*/
4948 0, /*tp_weaklistoffset*/
4949 0, /*tp_iter*/
4950 0, /*tp_iternext*/
4951 encoding_map_methods, /*tp_methods*/
4952 0, /*tp_members*/
4953 0, /*tp_getset*/
4954 0, /*tp_base*/
4955 0, /*tp_dict*/
4956 0, /*tp_descr_get*/
4957 0, /*tp_descr_set*/
4958 0, /*tp_dictoffset*/
4959 0, /*tp_init*/
4960 0, /*tp_alloc*/
4961 0, /*tp_new*/
4962 0, /*tp_free*/
4963 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004964};
4965
4966PyObject*
4967PyUnicode_BuildEncodingMap(PyObject* string)
4968{
4969 Py_UNICODE *decode;
4970 PyObject *result;
4971 struct encoding_map *mresult;
4972 int i;
4973 int need_dict = 0;
4974 unsigned char level1[32];
4975 unsigned char level2[512];
4976 unsigned char *mlevel1, *mlevel2, *mlevel3;
4977 int count2 = 0, count3 = 0;
4978
4979 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4980 PyErr_BadArgument();
4981 return NULL;
4982 }
4983 decode = PyUnicode_AS_UNICODE(string);
4984 memset(level1, 0xFF, sizeof level1);
4985 memset(level2, 0xFF, sizeof level2);
4986
4987 /* If there isn't a one-to-one mapping of NULL to \0,
4988 or if there are non-BMP characters, we need to use
4989 a mapping dictionary. */
4990 if (decode[0] != 0)
4991 need_dict = 1;
4992 for (i = 1; i < 256; i++) {
4993 int l1, l2;
4994 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004995#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004996 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004997#endif
4998 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004999 need_dict = 1;
5000 break;
5001 }
5002 if (decode[i] == 0xFFFE)
5003 /* unmapped character */
5004 continue;
5005 l1 = decode[i] >> 11;
5006 l2 = decode[i] >> 7;
5007 if (level1[l1] == 0xFF)
5008 level1[l1] = count2++;
5009 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005010 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005011 }
5012
5013 if (count2 >= 0xFF || count3 >= 0xFF)
5014 need_dict = 1;
5015
5016 if (need_dict) {
5017 PyObject *result = PyDict_New();
5018 PyObject *key, *value;
5019 if (!result)
5020 return NULL;
5021 for (i = 0; i < 256; i++) {
5022 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005023 key = PyLong_FromLong(decode[i]);
5024 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005025 if (!key || !value)
5026 goto failed1;
5027 if (PyDict_SetItem(result, key, value) == -1)
5028 goto failed1;
5029 Py_DECREF(key);
5030 Py_DECREF(value);
5031 }
5032 return result;
5033 failed1:
5034 Py_XDECREF(key);
5035 Py_XDECREF(value);
5036 Py_DECREF(result);
5037 return NULL;
5038 }
5039
5040 /* Create a three-level trie */
5041 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5042 16*count2 + 128*count3 - 1);
5043 if (!result)
5044 return PyErr_NoMemory();
5045 PyObject_Init(result, &EncodingMapType);
5046 mresult = (struct encoding_map*)result;
5047 mresult->count2 = count2;
5048 mresult->count3 = count3;
5049 mlevel1 = mresult->level1;
5050 mlevel2 = mresult->level23;
5051 mlevel3 = mresult->level23 + 16*count2;
5052 memcpy(mlevel1, level1, 32);
5053 memset(mlevel2, 0xFF, 16*count2);
5054 memset(mlevel3, 0, 128*count3);
5055 count3 = 0;
5056 for (i = 1; i < 256; i++) {
5057 int o1, o2, o3, i2, i3;
5058 if (decode[i] == 0xFFFE)
5059 /* unmapped character */
5060 continue;
5061 o1 = decode[i]>>11;
5062 o2 = (decode[i]>>7) & 0xF;
5063 i2 = 16*mlevel1[o1] + o2;
5064 if (mlevel2[i2] == 0xFF)
5065 mlevel2[i2] = count3++;
5066 o3 = decode[i] & 0x7F;
5067 i3 = 128*mlevel2[i2] + o3;
5068 mlevel3[i3] = i;
5069 }
5070 return result;
5071}
5072
5073static int
5074encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5075{
5076 struct encoding_map *map = (struct encoding_map*)mapping;
5077 int l1 = c>>11;
5078 int l2 = (c>>7) & 0xF;
5079 int l3 = c & 0x7F;
5080 int i;
5081
5082#ifdef Py_UNICODE_WIDE
5083 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005085 }
5086#endif
5087 if (c == 0)
5088 return 0;
5089 /* level 1*/
5090 i = map->level1[l1];
5091 if (i == 0xFF) {
5092 return -1;
5093 }
5094 /* level 2*/
5095 i = map->level23[16*i+l2];
5096 if (i == 0xFF) {
5097 return -1;
5098 }
5099 /* level 3 */
5100 i = map->level23[16*map->count2 + 128*i + l3];
5101 if (i == 0) {
5102 return -1;
5103 }
5104 return i;
5105}
5106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107/* Lookup the character ch in the mapping. If the character
5108 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005109 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111{
Christian Heimes217cfd12007-12-02 14:31:20 +00005112 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 PyObject *x;
5114
5115 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005117 x = PyObject_GetItem(mapping, w);
5118 Py_DECREF(w);
5119 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5121 /* No mapping found means: mapping is undefined. */
5122 PyErr_Clear();
5123 x = Py_None;
5124 Py_INCREF(x);
5125 return x;
5126 } else
5127 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005129 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005131 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 long value = PyLong_AS_LONG(x);
5133 if (value < 0 || value > 255) {
5134 PyErr_SetString(PyExc_TypeError,
5135 "character mapping must be in range(256)");
5136 Py_DECREF(x);
5137 return NULL;
5138 }
5139 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005141 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 /* wrong return value */
5145 PyErr_Format(PyExc_TypeError,
5146 "character mapping must return integer, bytes or None, not %.400s",
5147 x->ob_type->tp_name);
5148 Py_DECREF(x);
5149 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 }
5151}
5152
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005153static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005154charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005155{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005156 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5157 /* exponentially overallocate to minimize reallocations */
5158 if (requiredsize < 2*outsize)
5159 requiredsize = 2*outsize;
5160 if (_PyBytes_Resize(outobj, requiredsize))
5161 return -1;
5162 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005163}
5164
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005169 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005170 space is available. Return a new reference to the object that
5171 was put in the output buffer, or Py_None, if the mapping was undefined
5172 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005173 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005175charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005178 PyObject *rep;
5179 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005180 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005181
Christian Heimes90aa7642007-12-19 02:45:37 +00005182 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005183 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005185 if (res == -1)
5186 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 if (outsize<requiredsize)
5188 if (charmapencode_resize(outobj, outpos, requiredsize))
5189 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005190 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 outstart[(*outpos)++] = (char)res;
5192 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005193 }
5194
5195 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005198 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 Py_DECREF(rep);
5200 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005201 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005202 if (PyLong_Check(rep)) {
5203 Py_ssize_t requiredsize = *outpos+1;
5204 if (outsize<requiredsize)
5205 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5206 Py_DECREF(rep);
5207 return enc_EXCEPTION;
5208 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005209 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 else {
5213 const char *repchars = PyBytes_AS_STRING(rep);
5214 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5215 Py_ssize_t requiredsize = *outpos+repsize;
5216 if (outsize<requiredsize)
5217 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5218 Py_DECREF(rep);
5219 return enc_EXCEPTION;
5220 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005221 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 memcpy(outstart + *outpos, repchars, repsize);
5223 *outpos += repsize;
5224 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005225 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005226 Py_DECREF(rep);
5227 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005228}
5229
5230/* handle an error in PyUnicode_EncodeCharmap
5231 Return 0 on success, -1 on error */
5232static
5233int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005234 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005236 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005237 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238{
5239 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005240 Py_ssize_t repsize;
5241 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 Py_UNICODE *uni2;
5243 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005244 Py_ssize_t collstartpos = *inpos;
5245 Py_ssize_t collendpos = *inpos+1;
5246 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005247 char *encoding = "charmap";
5248 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005249 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005251 /* find all unencodable characters */
5252 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005253 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005254 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 int res = encoding_map_lookup(p[collendpos], mapping);
5256 if (res != -1)
5257 break;
5258 ++collendpos;
5259 continue;
5260 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005261
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 rep = charmapencode_lookup(p[collendpos], mapping);
5263 if (rep==NULL)
5264 return -1;
5265 else if (rep!=Py_None) {
5266 Py_DECREF(rep);
5267 break;
5268 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005269 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005271 }
5272 /* cache callback name lookup
5273 * (if not done yet, i.e. it's the first error) */
5274 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005275 if ((errors==NULL) || (!strcmp(errors, "strict")))
5276 *known_errorHandler = 1;
5277 else if (!strcmp(errors, "replace"))
5278 *known_errorHandler = 2;
5279 else if (!strcmp(errors, "ignore"))
5280 *known_errorHandler = 3;
5281 else if (!strcmp(errors, "xmlcharrefreplace"))
5282 *known_errorHandler = 4;
5283 else
5284 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005285 }
5286 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005287 case 1: /* strict */
5288 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5289 return -1;
5290 case 2: /* replace */
5291 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005292 x = charmapencode_output('?', mapping, res, respos);
5293 if (x==enc_EXCEPTION) {
5294 return -1;
5295 }
5296 else if (x==enc_FAILED) {
5297 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5298 return -1;
5299 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005300 }
5301 /* fall through */
5302 case 3: /* ignore */
5303 *inpos = collendpos;
5304 break;
5305 case 4: /* xmlcharrefreplace */
5306 /* generate replacement (temporarily (mis)uses p) */
5307 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 char buffer[2+29+1+1];
5309 char *cp;
5310 sprintf(buffer, "&#%d;", (int)p[collpos]);
5311 for (cp = buffer; *cp; ++cp) {
5312 x = charmapencode_output(*cp, mapping, res, respos);
5313 if (x==enc_EXCEPTION)
5314 return -1;
5315 else if (x==enc_FAILED) {
5316 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5317 return -1;
5318 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005319 }
5320 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005321 *inpos = collendpos;
5322 break;
5323 default:
5324 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 encoding, reason, p, size, exceptionObject,
5326 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005327 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005329 if (PyBytes_Check(repunicode)) {
5330 /* Directly copy bytes result to output. */
5331 Py_ssize_t outsize = PyBytes_Size(*res);
5332 Py_ssize_t requiredsize;
5333 repsize = PyBytes_Size(repunicode);
5334 requiredsize = *respos + repsize;
5335 if (requiredsize > outsize)
5336 /* Make room for all additional bytes. */
5337 if (charmapencode_resize(res, respos, requiredsize)) {
5338 Py_DECREF(repunicode);
5339 return -1;
5340 }
5341 memcpy(PyBytes_AsString(*res) + *respos,
5342 PyBytes_AsString(repunicode), repsize);
5343 *respos += repsize;
5344 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005345 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005346 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005347 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005348 /* generate replacement */
5349 repsize = PyUnicode_GET_SIZE(repunicode);
5350 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 x = charmapencode_output(*uni2, mapping, res, respos);
5352 if (x==enc_EXCEPTION) {
5353 return -1;
5354 }
5355 else if (x==enc_FAILED) {
5356 Py_DECREF(repunicode);
5357 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5358 return -1;
5359 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005360 }
5361 *inpos = newpos;
5362 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 }
5364 return 0;
5365}
5366
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 Py_ssize_t size,
5369 PyObject *mapping,
5370 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 /* output object */
5373 PyObject *res = NULL;
5374 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005375 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005377 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 PyObject *errorHandler = NULL;
5379 PyObject *exc = NULL;
5380 /* the following variable is used for caching string comparisons
5381 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5382 * 3=ignore, 4=xmlcharrefreplace */
5383 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384
5385 /* Default to Latin-1 */
5386 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 /* allocate enough for a simple encoding without
5390 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005391 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005392 if (res == NULL)
5393 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005394 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 /* try to encode it */
5399 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5400 if (x==enc_EXCEPTION) /* error */
5401 goto onError;
5402 if (x==enc_FAILED) { /* unencodable character */
5403 if (charmap_encoding_error(p, size, &inpos, mapping,
5404 &exc,
5405 &known_errorHandler, &errorHandler, errors,
5406 &res, &respos)) {
5407 goto onError;
5408 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005409 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 else
5411 /* done with this character => adjust input position */
5412 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005415 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005416 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005417 if (_PyBytes_Resize(&res, respos) < 0)
5418 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005420 Py_XDECREF(exc);
5421 Py_XDECREF(errorHandler);
5422 return res;
5423
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425 Py_XDECREF(res);
5426 Py_XDECREF(exc);
5427 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 return NULL;
5429}
5430
5431PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005432 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433{
5434 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005435 PyErr_BadArgument();
5436 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 }
5438 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 PyUnicode_GET_SIZE(unicode),
5440 mapping,
5441 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442}
5443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444/* create or adjust a UnicodeTranslateError */
5445static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 const Py_UNICODE *unicode, Py_ssize_t size,
5447 Py_ssize_t startpos, Py_ssize_t endpos,
5448 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005450 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005451 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005452 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 }
5454 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5456 goto onError;
5457 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5458 goto onError;
5459 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5460 goto onError;
5461 return;
5462 onError:
5463 Py_DECREF(*exceptionObject);
5464 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 }
5466}
5467
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468/* raises a UnicodeTranslateError */
5469static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 const Py_UNICODE *unicode, Py_ssize_t size,
5471 Py_ssize_t startpos, Py_ssize_t endpos,
5472 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473{
5474 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478}
5479
5480/* error handling callback helper:
5481 build arguments, call the callback and check the arguments,
5482 put the result into newpos and return the replacement string, which
5483 has to be freed by the caller */
5484static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 PyObject **errorHandler,
5486 const char *reason,
5487 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5488 Py_ssize_t startpos, Py_ssize_t endpos,
5489 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005490{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005491 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005492
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005493 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005494 PyObject *restuple;
5495 PyObject *resunicode;
5496
5497 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501 }
5502
5503 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005507
5508 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005513 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 Py_DECREF(restuple);
5515 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 }
5517 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 &resunicode, &i_newpos)) {
5519 Py_DECREF(restuple);
5520 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005521 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005522 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005524 else
5525 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005526 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5528 Py_DECREF(restuple);
5529 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005530 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005531 Py_INCREF(resunicode);
5532 Py_DECREF(restuple);
5533 return resunicode;
5534}
5535
5536/* Lookup the character ch in the mapping and put the result in result,
5537 which must be decrefed by the caller.
5538 Return 0 on success, -1 on error */
5539static
5540int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5541{
Christian Heimes217cfd12007-12-02 14:31:20 +00005542 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005543 PyObject *x;
5544
5545 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547 x = PyObject_GetItem(mapping, w);
5548 Py_DECREF(w);
5549 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5551 /* No mapping found means: use 1:1 mapping. */
5552 PyErr_Clear();
5553 *result = NULL;
5554 return 0;
5555 } else
5556 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005557 }
5558 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 *result = x;
5560 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005562 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 long value = PyLong_AS_LONG(x);
5564 long max = PyUnicode_GetMax();
5565 if (value < 0 || value > max) {
5566 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005567 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 Py_DECREF(x);
5569 return -1;
5570 }
5571 *result = x;
5572 return 0;
5573 }
5574 else if (PyUnicode_Check(x)) {
5575 *result = x;
5576 return 0;
5577 }
5578 else {
5579 /* wrong return value */
5580 PyErr_SetString(PyExc_TypeError,
5581 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005582 Py_DECREF(x);
5583 return -1;
5584 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585}
5586/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 if not reallocate and adjust various state variables.
5588 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005589static
Walter Dörwald4894c302003-10-24 14:25:28 +00005590int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005592{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005593 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005594 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 /* remember old output position */
5596 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5597 /* exponentially overallocate to minimize reallocations */
5598 if (requiredsize < 2 * oldsize)
5599 requiredsize = 2 * oldsize;
5600 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5601 return -1;
5602 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 }
5604 return 0;
5605}
5606/* lookup the character, put the result in the output string and adjust
5607 various state variables. Return a new reference to the object that
5608 was put in the output buffer in *result, or Py_None, if the mapping was
5609 undefined (in which case no character was written).
5610 The called must decref result.
5611 Return 0 on success, -1 on error. */
5612static
Walter Dörwald4894c302003-10-24 14:25:28 +00005613int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005614 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5615 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616{
Walter Dörwald4894c302003-10-24 14:25:28 +00005617 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 /* not found => default to 1:1 mapping */
5621 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 }
5623 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005625 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 /* no overflow check, because we know that the space is enough */
5627 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 }
5629 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5631 if (repsize==1) {
5632 /* no overflow check, because we know that the space is enough */
5633 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5634 }
5635 else if (repsize!=0) {
5636 /* more than one character */
5637 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5638 (insize - (curinp-startinp)) +
5639 repsize - 1;
5640 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5641 return -1;
5642 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5643 *outp += repsize;
5644 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645 }
5646 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 return 0;
5649}
5650
5651PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 Py_ssize_t size,
5653 PyObject *mapping,
5654 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 /* output object */
5657 PyObject *res = NULL;
5658 /* pointers to the beginning and end+1 of input */
5659 const Py_UNICODE *startp = p;
5660 const Py_UNICODE *endp = p + size;
5661 /* pointer into the output */
5662 Py_UNICODE *str;
5663 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005664 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665 char *reason = "character maps to <undefined>";
5666 PyObject *errorHandler = NULL;
5667 PyObject *exc = NULL;
5668 /* the following variable is used for caching string comparisons
5669 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5670 * 3=ignore, 4=xmlcharrefreplace */
5671 int known_errorHandler = -1;
5672
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 PyErr_BadArgument();
5675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677
5678 /* allocate enough for a simple 1:1 translation without
5679 replacements, if we need more, we'll resize */
5680 res = PyUnicode_FromUnicode(NULL, size);
5681 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 /* try to encode it */
5689 PyObject *x = NULL;
5690 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5691 Py_XDECREF(x);
5692 goto onError;
5693 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005694 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 if (x!=Py_None) /* it worked => adjust input pointer */
5696 ++p;
5697 else { /* untranslatable character */
5698 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5699 Py_ssize_t repsize;
5700 Py_ssize_t newpos;
5701 Py_UNICODE *uni2;
5702 /* startpos for collecting untranslatable chars */
5703 const Py_UNICODE *collstart = p;
5704 const Py_UNICODE *collend = p+1;
5705 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 /* find all untranslatable characters */
5708 while (collend < endp) {
5709 if (charmaptranslate_lookup(*collend, mapping, &x))
5710 goto onError;
5711 Py_XDECREF(x);
5712 if (x!=Py_None)
5713 break;
5714 ++collend;
5715 }
5716 /* cache callback name lookup
5717 * (if not done yet, i.e. it's the first error) */
5718 if (known_errorHandler==-1) {
5719 if ((errors==NULL) || (!strcmp(errors, "strict")))
5720 known_errorHandler = 1;
5721 else if (!strcmp(errors, "replace"))
5722 known_errorHandler = 2;
5723 else if (!strcmp(errors, "ignore"))
5724 known_errorHandler = 3;
5725 else if (!strcmp(errors, "xmlcharrefreplace"))
5726 known_errorHandler = 4;
5727 else
5728 known_errorHandler = 0;
5729 }
5730 switch (known_errorHandler) {
5731 case 1: /* strict */
5732 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005733 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 case 2: /* replace */
5735 /* No need to check for space, this is a 1:1 replacement */
5736 for (coll = collstart; coll<collend; ++coll)
5737 *str++ = '?';
5738 /* fall through */
5739 case 3: /* ignore */
5740 p = collend;
5741 break;
5742 case 4: /* xmlcharrefreplace */
5743 /* generate replacement (temporarily (mis)uses p) */
5744 for (p = collstart; p < collend; ++p) {
5745 char buffer[2+29+1+1];
5746 char *cp;
5747 sprintf(buffer, "&#%d;", (int)*p);
5748 if (charmaptranslate_makespace(&res, &str,
5749 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5750 goto onError;
5751 for (cp = buffer; *cp; ++cp)
5752 *str++ = *cp;
5753 }
5754 p = collend;
5755 break;
5756 default:
5757 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5758 reason, startp, size, &exc,
5759 collstart-startp, collend-startp, &newpos);
5760 if (repunicode == NULL)
5761 goto onError;
5762 /* generate replacement */
5763 repsize = PyUnicode_GET_SIZE(repunicode);
5764 if (charmaptranslate_makespace(&res, &str,
5765 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5766 Py_DECREF(repunicode);
5767 goto onError;
5768 }
5769 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5770 *str++ = *uni2;
5771 p = startp + newpos;
5772 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005774 }
5775 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 /* Resize if we allocated to much */
5777 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005778 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 if (PyUnicode_Resize(&res, respos) < 0)
5780 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 }
5782 Py_XDECREF(exc);
5783 Py_XDECREF(errorHandler);
5784 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005787 Py_XDECREF(res);
5788 Py_XDECREF(exc);
5789 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 return NULL;
5791}
5792
5793PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 PyObject *mapping,
5795 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796{
5797 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005798
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 str = PyUnicode_FromObject(str);
5800 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 PyUnicode_GET_SIZE(str),
5804 mapping,
5805 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 Py_DECREF(str);
5807 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005808
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 Py_XDECREF(str);
5811 return NULL;
5812}
Tim Petersced69f82003-09-16 20:30:58 +00005813
Guido van Rossum9e896b32000-04-05 20:11:21 +00005814/* --- Decimal Encoder ---------------------------------------------------- */
5815
5816int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 Py_ssize_t length,
5818 char *output,
5819 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005820{
5821 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005822 PyObject *errorHandler = NULL;
5823 PyObject *exc = NULL;
5824 const char *encoding = "decimal";
5825 const char *reason = "invalid decimal Unicode string";
5826 /* the following variable is used for caching string comparisons
5827 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5828 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005829
5830 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 PyErr_BadArgument();
5832 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005833 }
5834
5835 p = s;
5836 end = s + length;
5837 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 register Py_UNICODE ch = *p;
5839 int decimal;
5840 PyObject *repunicode;
5841 Py_ssize_t repsize;
5842 Py_ssize_t newpos;
5843 Py_UNICODE *uni2;
5844 Py_UNICODE *collstart;
5845 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005846
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005848 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 ++p;
5850 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005851 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 decimal = Py_UNICODE_TODECIMAL(ch);
5853 if (decimal >= 0) {
5854 *output++ = '0' + decimal;
5855 ++p;
5856 continue;
5857 }
5858 if (0 < ch && ch < 256) {
5859 *output++ = (char)ch;
5860 ++p;
5861 continue;
5862 }
5863 /* All other characters are considered unencodable */
5864 collstart = p;
5865 collend = p+1;
5866 while (collend < end) {
5867 if ((0 < *collend && *collend < 256) ||
5868 !Py_UNICODE_ISSPACE(*collend) ||
5869 Py_UNICODE_TODECIMAL(*collend))
5870 break;
5871 }
5872 /* cache callback name lookup
5873 * (if not done yet, i.e. it's the first error) */
5874 if (known_errorHandler==-1) {
5875 if ((errors==NULL) || (!strcmp(errors, "strict")))
5876 known_errorHandler = 1;
5877 else if (!strcmp(errors, "replace"))
5878 known_errorHandler = 2;
5879 else if (!strcmp(errors, "ignore"))
5880 known_errorHandler = 3;
5881 else if (!strcmp(errors, "xmlcharrefreplace"))
5882 known_errorHandler = 4;
5883 else
5884 known_errorHandler = 0;
5885 }
5886 switch (known_errorHandler) {
5887 case 1: /* strict */
5888 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5889 goto onError;
5890 case 2: /* replace */
5891 for (p = collstart; p < collend; ++p)
5892 *output++ = '?';
5893 /* fall through */
5894 case 3: /* ignore */
5895 p = collend;
5896 break;
5897 case 4: /* xmlcharrefreplace */
5898 /* generate replacement (temporarily (mis)uses p) */
5899 for (p = collstart; p < collend; ++p)
5900 output += sprintf(output, "&#%d;", (int)*p);
5901 p = collend;
5902 break;
5903 default:
5904 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5905 encoding, reason, s, length, &exc,
5906 collstart-s, collend-s, &newpos);
5907 if (repunicode == NULL)
5908 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005909 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005910 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005911 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5912 Py_DECREF(repunicode);
5913 goto onError;
5914 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 /* generate replacement */
5916 repsize = PyUnicode_GET_SIZE(repunicode);
5917 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5918 Py_UNICODE ch = *uni2;
5919 if (Py_UNICODE_ISSPACE(ch))
5920 *output++ = ' ';
5921 else {
5922 decimal = Py_UNICODE_TODECIMAL(ch);
5923 if (decimal >= 0)
5924 *output++ = '0' + decimal;
5925 else if (0 < ch && ch < 256)
5926 *output++ = (char)ch;
5927 else {
5928 Py_DECREF(repunicode);
5929 raise_encode_exception(&exc, encoding,
5930 s, length, collstart-s, collend-s, reason);
5931 goto onError;
5932 }
5933 }
5934 }
5935 p = s + newpos;
5936 Py_DECREF(repunicode);
5937 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005938 }
5939 /* 0-terminate the output string */
5940 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941 Py_XDECREF(exc);
5942 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005943 return 0;
5944
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946 Py_XDECREF(exc);
5947 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005948 return -1;
5949}
5950
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951/* --- Helpers ------------------------------------------------------------ */
5952
Eric Smith8c663262007-08-25 02:26:07 +00005953#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005954#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005955
Thomas Wouters477c8d52006-05-27 19:21:47 +00005956#include "stringlib/count.h"
5957#include "stringlib/find.h"
5958#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005959#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005960
Eric Smith5807c412008-05-11 21:00:57 +00005961#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005962#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005963#include "stringlib/localeutil.h"
5964
/* Helper macro: normalize the slice bounds `start` and `end` in place
   against a sequence of length `len`.

   - `end` greater than `len` is capped at `len`;
   - a negative `end` or `start` has `len` added to it and is then floored
     at 0.

   `start` is NOT capped at `len`, so after the macro `start` may still be
   greater than `end`; callers have to cope with that.

   `start` and `end` must be modifiable lvalues, and every argument is
   evaluated more than once, so do not pass expressions with side effects.
   The body is wrapped in do { } while (0) so that an invocation followed
   by ';' is exactly one statement and is safe in an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005979
Martin v. Löwis18e16552006-02-15 17:27:45 +00005980Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005981 PyObject *substr,
5982 Py_ssize_t start,
5983 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005985 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005986 PyUnicodeObject* str_obj;
5987 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005988
Thomas Wouters477c8d52006-05-27 19:21:47 +00005989 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5990 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005992 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5993 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 Py_DECREF(str_obj);
5995 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 }
Tim Petersced69f82003-09-16 20:30:58 +00005997
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005998 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005999 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006000 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6001 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006002 );
6003
6004 Py_DECREF(sub_obj);
6005 Py_DECREF(str_obj);
6006
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 return result;
6008}
6009
Martin v. Löwis18e16552006-02-15 17:27:45 +00006010Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006011 PyObject *sub,
6012 Py_ssize_t start,
6013 Py_ssize_t end,
6014 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006016 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006017
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006019 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006021 sub = PyUnicode_FromObject(sub);
6022 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 Py_DECREF(str);
6024 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 }
Tim Petersced69f82003-09-16 20:30:58 +00006026
Thomas Wouters477c8d52006-05-27 19:21:47 +00006027 if (direction > 0)
6028 result = stringlib_find_slice(
6029 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6030 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6031 start, end
6032 );
6033 else
6034 result = stringlib_rfind_slice(
6035 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6036 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6037 start, end
6038 );
6039
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006041 Py_DECREF(sub);
6042
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 return result;
6044}
6045
Tim Petersced69f82003-09-16 20:30:58 +00006046static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 PyUnicodeObject *substring,
6049 Py_ssize_t start,
6050 Py_ssize_t end,
6051 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 if (substring->length == 0)
6054 return 1;
6055
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006056 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 end -= substring->length;
6058 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060
6061 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 if (Py_UNICODE_MATCH(self, end, substring))
6063 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 } else {
6065 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 }
6068
6069 return 0;
6070}
6071
Martin v. Löwis18e16552006-02-15 17:27:45 +00006072Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 PyObject *substr,
6074 Py_ssize_t start,
6075 Py_ssize_t end,
6076 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006078 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006079
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 str = PyUnicode_FromObject(str);
6081 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 substr = PyUnicode_FromObject(substr);
6084 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 Py_DECREF(str);
6086 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 }
Tim Petersced69f82003-09-16 20:30:58 +00006088
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 (PyUnicodeObject *)substr,
6091 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 Py_DECREF(str);
6093 Py_DECREF(substr);
6094 return result;
6095}
6096
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097/* Apply fixfct filter to the Unicode object self and return a
6098 reference to the modified object */
6099
Tim Petersced69f82003-09-16 20:30:58 +00006100static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103{
6104
6105 PyUnicodeObject *u;
6106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006107 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006110
6111 Py_UNICODE_COPY(u->str, self->str, self->length);
6112
Tim Peters7a29bd52001-09-12 03:03:31 +00006113 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 /* fixfct should return TRUE if it modified the buffer. If
6115 FALSE, return a reference to the original buffer instead
6116 (to save space, not time) */
6117 Py_INCREF(self);
6118 Py_DECREF(u);
6119 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 }
6121 return (PyObject*) u;
6122}
6123
Tim Petersced69f82003-09-16 20:30:58 +00006124static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125int fixupper(PyUnicodeObject *self)
6126{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006127 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 Py_UNICODE *s = self->str;
6129 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006130
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006133
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 ch = Py_UNICODE_TOUPPER(*s);
6135 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 *s = ch;
6138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 s++;
6140 }
6141
6142 return status;
6143}
6144
Tim Petersced69f82003-09-16 20:30:58 +00006145static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146int fixlower(PyUnicodeObject *self)
6147{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006148 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 Py_UNICODE *s = self->str;
6150 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006151
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006154
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 ch = Py_UNICODE_TOLOWER(*s);
6156 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 *s = ch;
6159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 s++;
6161 }
6162
6163 return status;
6164}
6165
Tim Petersced69f82003-09-16 20:30:58 +00006166static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167int fixswapcase(PyUnicodeObject *self)
6168{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006169 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 Py_UNICODE *s = self->str;
6171 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006172
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 while (len-- > 0) {
6174 if (Py_UNICODE_ISUPPER(*s)) {
6175 *s = Py_UNICODE_TOLOWER(*s);
6176 status = 1;
6177 } else if (Py_UNICODE_ISLOWER(*s)) {
6178 *s = Py_UNICODE_TOUPPER(*s);
6179 status = 1;
6180 }
6181 s++;
6182 }
6183
6184 return status;
6185}
6186
Tim Petersced69f82003-09-16 20:30:58 +00006187static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188int fixcapitalize(PyUnicodeObject *self)
6189{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006190 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006191 Py_UNICODE *s = self->str;
6192 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006193
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006194 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006196 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 *s = Py_UNICODE_TOUPPER(*s);
6198 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006200 s++;
6201 while (--len > 0) {
6202 if (Py_UNICODE_ISUPPER(*s)) {
6203 *s = Py_UNICODE_TOLOWER(*s);
6204 status = 1;
6205 }
6206 s++;
6207 }
6208 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209}
6210
6211static
6212int fixtitle(PyUnicodeObject *self)
6213{
6214 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6215 register Py_UNICODE *e;
6216 int previous_is_cased;
6217
6218 /* Shortcut for single character strings */
6219 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6221 if (*p != ch) {
6222 *p = ch;
6223 return 1;
6224 }
6225 else
6226 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 }
Tim Petersced69f82003-09-16 20:30:58 +00006228
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 e = p + PyUnicode_GET_SIZE(self);
6230 previous_is_cased = 0;
6231 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006233
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 if (previous_is_cased)
6235 *p = Py_UNICODE_TOLOWER(ch);
6236 else
6237 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006238
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 if (Py_UNICODE_ISLOWER(ch) ||
6240 Py_UNICODE_ISUPPER(ch) ||
6241 Py_UNICODE_ISTITLE(ch))
6242 previous_is_cased = 1;
6243 else
6244 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 }
6246 return 1;
6247}
6248
Tim Peters8ce9f162004-08-27 01:49:32 +00006249PyObject *
6250PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251{
Skip Montanaro6543b452004-09-16 03:28:13 +00006252 const Py_UNICODE blank = ' ';
6253 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006254 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006255 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006256 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6257 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006258 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6259 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006260 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006261 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262
Tim Peters05eba1f2004-08-27 21:32:02 +00006263 fseq = PySequence_Fast(seq, "");
6264 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006265 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006266 }
6267
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006268 /* NOTE: the following code can't call back into Python code,
6269 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006270 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006271
Tim Peters05eba1f2004-08-27 21:32:02 +00006272 seqlen = PySequence_Fast_GET_SIZE(fseq);
6273 /* If empty sequence, return u"". */
6274 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006275 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6276 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006277 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006278 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006279 /* If singleton sequence with an exact Unicode, return that. */
6280 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 item = items[0];
6282 if (PyUnicode_CheckExact(item)) {
6283 Py_INCREF(item);
6284 res = (PyUnicodeObject *)item;
6285 goto Done;
6286 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006287 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006288 else {
6289 /* Set up sep and seplen */
6290 if (separator == NULL) {
6291 sep = &blank;
6292 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006293 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006294 else {
6295 if (!PyUnicode_Check(separator)) {
6296 PyErr_Format(PyExc_TypeError,
6297 "separator: expected str instance,"
6298 " %.80s found",
6299 Py_TYPE(separator)->tp_name);
6300 goto onError;
6301 }
6302 sep = PyUnicode_AS_UNICODE(separator);
6303 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006304 }
6305 }
6306
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006307 /* There are at least two things to join, or else we have a subclass
6308 * of str in the sequence.
6309 * Do a pre-pass to figure out the total amount of space we'll
6310 * need (sz), and see whether all argument are strings.
6311 */
6312 sz = 0;
6313 for (i = 0; i < seqlen; i++) {
6314 const Py_ssize_t old_sz = sz;
6315 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 if (!PyUnicode_Check(item)) {
6317 PyErr_Format(PyExc_TypeError,
6318 "sequence item %zd: expected str instance,"
6319 " %.80s found",
6320 i, Py_TYPE(item)->tp_name);
6321 goto onError;
6322 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006323 sz += PyUnicode_GET_SIZE(item);
6324 if (i != 0)
6325 sz += seplen;
6326 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6327 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006329 goto onError;
6330 }
6331 }
Tim Petersced69f82003-09-16 20:30:58 +00006332
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006333 res = _PyUnicode_New(sz);
6334 if (res == NULL)
6335 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006336
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006337 /* Catenate everything. */
6338 res_p = PyUnicode_AS_UNICODE(res);
6339 for (i = 0; i < seqlen; ++i) {
6340 Py_ssize_t itemlen;
6341 item = items[i];
6342 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 /* Copy item, and maybe the separator. */
6344 if (i) {
6345 Py_UNICODE_COPY(res_p, sep, seplen);
6346 res_p += seplen;
6347 }
6348 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6349 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006350 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006351
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006353 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 return (PyObject *)res;
6355
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006357 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006358 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 return NULL;
6360}
6361
Tim Petersced69f82003-09-16 20:30:58 +00006362static
6363PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 Py_ssize_t left,
6365 Py_ssize_t right,
6366 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367{
6368 PyUnicodeObject *u;
6369
6370 if (left < 0)
6371 left = 0;
6372 if (right < 0)
6373 right = 0;
6374
Tim Peters7a29bd52001-09-12 03:03:31 +00006375 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376 Py_INCREF(self);
6377 return self;
6378 }
6379
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006380 if (left > PY_SSIZE_T_MAX - self->length ||
6381 right > PY_SSIZE_T_MAX - (left + self->length)) {
6382 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6383 return NULL;
6384 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385 u = _PyUnicode_New(left + self->length + right);
6386 if (u) {
6387 if (left)
6388 Py_UNICODE_FILL(u->str, fill, left);
6389 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6390 if (right)
6391 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6392 }
6393
6394 return u;
6395}
6396
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006397PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400
6401 string = PyUnicode_FromObject(string);
6402 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006405 list = stringlib_splitlines(
6406 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6407 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
6409 Py_DECREF(string);
6410 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411}
6412
Tim Petersced69f82003-09-16 20:30:58 +00006413static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 PyUnicodeObject *substring,
6416 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006419 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006422 return stringlib_split_whitespace(
6423 (PyObject*) self, self->str, self->length, maxcount
6424 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006426 return stringlib_split(
6427 (PyObject*) self, self->str, self->length,
6428 substring->str, substring->length,
6429 maxcount
6430 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431}
6432
Tim Petersced69f82003-09-16 20:30:58 +00006433static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006434PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 PyUnicodeObject *substring,
6436 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006437{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006438 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006439 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006440
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006441 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006442 return stringlib_rsplit_whitespace(
6443 (PyObject*) self, self->str, self->length, maxcount
6444 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006445
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006446 return stringlib_rsplit(
6447 (PyObject*) self, self->str, self->length,
6448 substring->str, substring->length,
6449 maxcount
6450 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006451}
6452
6453static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 PyUnicodeObject *str1,
6456 PyUnicodeObject *str2,
6457 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458{
6459 PyUnicodeObject *u;
6460
6461 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006463 else if (maxcount == 0 || self->length == 0)
6464 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465
Thomas Wouters477c8d52006-05-27 19:21:47 +00006466 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006467 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006468 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006469 if (str1->length == 0)
6470 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006471 if (str1->length == 1) {
6472 /* replace characters */
6473 Py_UNICODE u1, u2;
6474 if (!findchar(self->str, self->length, str1->str[0]))
6475 goto nothing;
6476 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6477 if (!u)
6478 return NULL;
6479 Py_UNICODE_COPY(u->str, self->str, self->length);
6480 u1 = str1->str[0];
6481 u2 = str2->str[0];
6482 for (i = 0; i < u->length; i++)
6483 if (u->str[i] == u1) {
6484 if (--maxcount < 0)
6485 break;
6486 u->str[i] = u2;
6487 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006489 i = stringlib_find(
6490 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006492 if (i < 0)
6493 goto nothing;
6494 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6495 if (!u)
6496 return NULL;
6497 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006498
6499 /* change everything in-place, starting with this one */
6500 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6501 i += str1->length;
6502
6503 while ( --maxcount > 0) {
6504 i = stringlib_find(self->str+i, self->length-i,
6505 str1->str, str1->length,
6506 i);
6507 if (i == -1)
6508 break;
6509 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6510 i += str1->length;
6511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006514
6515 Py_ssize_t n, i, j, e;
6516 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 Py_UNICODE *p;
6518
6519 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006520 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6521 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006522 if (n == 0)
6523 goto nothing;
6524 /* new_size = self->length + n * (str2->length - str1->length)); */
6525 delta = (str2->length - str1->length);
6526 if (delta == 0) {
6527 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006529 product = n * (str2->length - str1->length);
6530 if ((product / (str2->length - str1->length)) != n) {
6531 PyErr_SetString(PyExc_OverflowError,
6532 "replace string is too long");
6533 return NULL;
6534 }
6535 new_size = self->length + product;
6536 if (new_size < 0) {
6537 PyErr_SetString(PyExc_OverflowError,
6538 "replace string is too long");
6539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 }
6541 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006542 u = _PyUnicode_New(new_size);
6543 if (!u)
6544 return NULL;
6545 i = 0;
6546 p = u->str;
6547 e = self->length - str1->length;
6548 if (str1->length > 0) {
6549 while (n-- > 0) {
6550 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006551 j = stringlib_find(self->str+i, self->length-i,
6552 str1->str, str1->length,
6553 i);
6554 if (j == -1)
6555 break;
6556 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006557 /* copy unchanged part [i:j] */
6558 Py_UNICODE_COPY(p, self->str+i, j-i);
6559 p += j - i;
6560 }
6561 /* copy substitution string */
6562 if (str2->length > 0) {
6563 Py_UNICODE_COPY(p, str2->str, str2->length);
6564 p += str2->length;
6565 }
6566 i = j + str1->length;
6567 }
6568 if (i < self->length)
6569 /* copy tail [i:] */
6570 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6571 } else {
6572 /* interleave */
6573 while (n > 0) {
6574 Py_UNICODE_COPY(p, str2->str, str2->length);
6575 p += str2->length;
6576 if (--n <= 0)
6577 break;
6578 *p++ = self->str[i++];
6579 }
6580 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006586 /* nothing to replace; return original string (when possible) */
6587 if (PyUnicode_CheckExact(self)) {
6588 Py_INCREF(self);
6589 return (PyObject *) self;
6590 }
6591 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592}
6593
6594/* --- Unicode Object Methods --------------------------------------------- */
6595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006596PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598\n\
6599Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
6602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006603unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 return fixup(self, fixtitle);
6606}
6607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006608PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610\n\
6611Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006612have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
6614static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006615unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 return fixup(self, fixcapitalize);
6618}
6619
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).  Splits self on
   whitespace, runs fixcapitalize over every word via fixup(), and joins
   the words again with PyUnicode_Join(NULL, ...), i.e. with a single
   blank.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6657
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006658/* Argument converter. Coerces to a single unicode character */
6659
6660static int
6661convert_uc(PyObject *obj, void *addr)
6662{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006663 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6664 PyObject *uniobj;
6665 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006666
Benjamin Peterson14339b62009-01-31 16:36:08 +00006667 uniobj = PyUnicode_FromObject(obj);
6668 if (uniobj == NULL) {
6669 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006671 return 0;
6672 }
6673 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6674 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006676 Py_DECREF(uniobj);
6677 return 0;
6678 }
6679 unistr = PyUnicode_AS_UNICODE(uniobj);
6680 *fillcharloc = unistr[0];
6681 Py_DECREF(uniobj);
6682 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006683}
6684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006685PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006688Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006689done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
6691static PyObject *
6692unicode_center(PyUnicodeObject *self, PyObject *args)
6693{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t marg, left;
6695 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006696 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697
Thomas Woutersde017742006-02-16 19:34:37 +00006698 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 return NULL;
6700
Tim Peters7a29bd52001-09-12 03:03:31 +00006701 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 Py_INCREF(self);
6703 return (PyObject*) self;
6704 }
6705
6706 marg = width - self->length;
6707 left = marg / 2 + (marg & width & 1);
6708
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006709 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710}
6711
Marc-André Lemburge5034372000-08-08 08:04:29 +00006712#if 0
6713
6714/* This code should go into some future Unicode collation support
6715 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006716 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006717
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006718/* speedy UTF-16 code point order comparison */
6719/* gleaned from: */
6720/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6721
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006722static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006723{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006724 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006725 0, 0, 0, 0, 0, 0, 0, 0,
6726 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006727 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006728};
6729
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730static int
6731unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6732{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006733 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 Py_UNICODE *s1 = str1->str;
6736 Py_UNICODE *s2 = str2->str;
6737
6738 len1 = str1->length;
6739 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006740
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006742 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006743
6744 c1 = *s1++;
6745 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006746
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 if (c1 > (1<<11) * 26)
6748 c1 += utf16Fixup[c1>>11];
6749 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006750 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006751 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006752
6753 if (c1 != c2)
6754 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006755
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006756 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 }
6758
6759 return (len1 < len2) ? -1 : (len1 != len2);
6760}
6761
Marc-André Lemburge5034372000-08-08 08:04:29 +00006762#else
6763
6764static int
6765unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6766{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006767 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006768
6769 Py_UNICODE *s1 = str1->str;
6770 Py_UNICODE *s2 = str2->str;
6771
6772 len1 = str1->length;
6773 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006774
Marc-André Lemburge5034372000-08-08 08:04:29 +00006775 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006776 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006777
Fredrik Lundh45714e92001-06-26 16:39:36 +00006778 c1 = *s1++;
6779 c2 = *s2++;
6780
6781 if (c1 != c2)
6782 return (c1 < c2) ? -1 : 1;
6783
Marc-André Lemburge5034372000-08-08 08:04:29 +00006784 len1--; len2--;
6785 }
6786
6787 return (len1 < len2) ? -1 : (len1 != len2);
6788}
6789
6790#endif
6791
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006795 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6796 return unicode_compare((PyUnicodeObject *)left,
6797 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006798 PyErr_Format(PyExc_TypeError,
6799 "Can't compare %.100s and %.100s",
6800 left->ob_type->tp_name,
6801 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 return -1;
6803}
6804
Martin v. Löwis5b222132007-06-10 09:51:05 +00006805int
6806PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6807{
6808 int i;
6809 Py_UNICODE *id;
6810 assert(PyUnicode_Check(uni));
6811 id = PyUnicode_AS_UNICODE(uni);
6812 /* Compare Unicode string and source character set string */
6813 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 if (id[i] != str[i])
6815 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006816 /* This check keeps Python strings that end in '\0' from comparing equal
6817 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006818 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006820 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006822 return 0;
6823}
6824
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006825
Benjamin Peterson29060642009-01-31 22:14:21 +00006826#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006827 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006828
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006829PyObject *PyUnicode_RichCompare(PyObject *left,
6830 PyObject *right,
6831 int op)
6832{
6833 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006834
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006835 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6836 PyObject *v;
6837 if (((PyUnicodeObject *) left)->length !=
6838 ((PyUnicodeObject *) right)->length) {
6839 if (op == Py_EQ) {
6840 Py_INCREF(Py_False);
6841 return Py_False;
6842 }
6843 if (op == Py_NE) {
6844 Py_INCREF(Py_True);
6845 return Py_True;
6846 }
6847 }
6848 if (left == right)
6849 result = 0;
6850 else
6851 result = unicode_compare((PyUnicodeObject *)left,
6852 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006853
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006854 /* Convert the return value to a Boolean */
6855 switch (op) {
6856 case Py_EQ:
6857 v = TEST_COND(result == 0);
6858 break;
6859 case Py_NE:
6860 v = TEST_COND(result != 0);
6861 break;
6862 case Py_LE:
6863 v = TEST_COND(result <= 0);
6864 break;
6865 case Py_GE:
6866 v = TEST_COND(result >= 0);
6867 break;
6868 case Py_LT:
6869 v = TEST_COND(result == -1);
6870 break;
6871 case Py_GT:
6872 v = TEST_COND(result == 1);
6873 break;
6874 default:
6875 PyErr_BadArgument();
6876 return NULL;
6877 }
6878 Py_INCREF(v);
6879 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006880 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006881
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006882 Py_INCREF(Py_NotImplemented);
6883 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006884}
6885
Guido van Rossum403d68b2000-03-13 15:55:09 +00006886int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006888{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006889 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006890 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006891
6892 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006893 sub = PyUnicode_FromObject(element);
6894 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 PyErr_Format(PyExc_TypeError,
6896 "'in <string>' requires string as left operand, not %s",
6897 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006898 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006899 }
6900
Thomas Wouters477c8d52006-05-27 19:21:47 +00006901 str = PyUnicode_FromObject(container);
6902 if (!str) {
6903 Py_DECREF(sub);
6904 return -1;
6905 }
6906
6907 result = stringlib_contains_obj(str, sub);
6908
6909 Py_DECREF(str);
6910 Py_DECREF(sub);
6911
Guido van Rossum403d68b2000-03-13 15:55:09 +00006912 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006913}
6914
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915/* Concat to string or Unicode object giving a new Unicode object. */
6916
6917PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919{
6920 PyUnicodeObject *u = NULL, *v = NULL, *w;
6921
6922 /* Coerce the two arguments */
6923 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6924 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6927 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929
6930 /* Shortcuts */
6931 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 Py_DECREF(v);
6933 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 }
6935 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 Py_DECREF(u);
6937 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 }
6939
6940 /* Concat the two Unicode strings */
6941 w = _PyUnicode_New(u->length + v->length);
6942 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 Py_UNICODE_COPY(w->str, u->str, u->length);
6945 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6946
6947 Py_DECREF(u);
6948 Py_DECREF(v);
6949 return (PyObject *)w;
6950
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 Py_XDECREF(u);
6953 Py_XDECREF(v);
6954 return NULL;
6955}
6956
Walter Dörwald1ab83302007-05-18 17:15:44 +00006957void
6958PyUnicode_Append(PyObject **pleft, PyObject *right)
6959{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006960 PyObject *new;
6961 if (*pleft == NULL)
6962 return;
6963 if (right == NULL || !PyUnicode_Check(*pleft)) {
6964 Py_DECREF(*pleft);
6965 *pleft = NULL;
6966 return;
6967 }
6968 new = PyUnicode_Concat(*pleft, right);
6969 Py_DECREF(*pleft);
6970 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006971}
6972
6973void
6974PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6975{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006976 PyUnicode_Append(pleft, right);
6977 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006978}
6979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006980PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006983Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006984string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006985interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986
6987static PyObject *
6988unicode_count(PyUnicodeObject *self, PyObject *args)
6989{
6990 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006991 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006992 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 PyObject *result;
6994
Guido van Rossumb8872e62000-05-09 14:14:27 +00006995 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006996 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 return NULL;
6998
6999 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007000 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007003
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007004 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007005 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007006 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007007 substring->str, substring->length,
7008 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010
7011 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007012
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 return result;
7014}
7015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007016PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007019Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007020to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007021handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007022a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7023'xmlcharrefreplace' as well as any other name registered with\n\
7024codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025
7026static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007027unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007029 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 char *encoding = NULL;
7031 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007032 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007033
Benjamin Peterson308d6372009-09-18 21:42:35 +00007034 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7035 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007037 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007038 if (v == NULL)
7039 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007040 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007041 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007042 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007043 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007044 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007045 Py_DECREF(v);
7046 return NULL;
7047 }
7048 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007049
Benjamin Peterson29060642009-01-31 22:14:21 +00007050 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007051 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007052}
7053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007054PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056\n\
7057Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007058If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059
7060static PyObject*
7061unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7062{
7063 Py_UNICODE *e;
7064 Py_UNICODE *p;
7065 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007066 Py_UNICODE *qe;
7067 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 PyUnicodeObject *u;
7069 int tabsize = 8;
7070
7071 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073
Thomas Wouters7e474022000-07-16 12:04:32 +00007074 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007075 i = 0; /* chars up to and including most recent \n or \r */
7076 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7077 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 for (p = self->str; p < e; p++)
7079 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 if (tabsize > 0) {
7081 incr = tabsize - (j % tabsize); /* cannot overflow */
7082 if (j > PY_SSIZE_T_MAX - incr)
7083 goto overflow1;
7084 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007085 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 if (j > PY_SSIZE_T_MAX - 1)
7089 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 j++;
7091 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 if (i > PY_SSIZE_T_MAX - j)
7093 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007095 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 }
7097 }
7098
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007099 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007101
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 /* Second pass: create output string and fill it */
7103 u = _PyUnicode_New(i + j);
7104 if (!u)
7105 return NULL;
7106
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007107 j = 0; /* same as in first pass */
7108 q = u->str; /* next output char */
7109 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110
7111 for (p = self->str; p < e; p++)
7112 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 if (tabsize > 0) {
7114 i = tabsize - (j % tabsize);
7115 j += i;
7116 while (i--) {
7117 if (q >= qe)
7118 goto overflow2;
7119 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007120 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007122 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 else {
7124 if (q >= qe)
7125 goto overflow2;
7126 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007127 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128 if (*p == '\n' || *p == '\r')
7129 j = 0;
7130 }
7131
7132 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007133
7134 overflow2:
7135 Py_DECREF(u);
7136 overflow1:
7137 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139}
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143\n\
7144Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007145such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146arguments start and end are interpreted as in slice notation.\n\
7147\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007148Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
7150static PyObject *
7151unicode_find(PyUnicodeObject *self, PyObject *args)
7152{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007153 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007154 Py_ssize_t start;
7155 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007156 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
Christian Heimes9cd17752007-11-18 19:35:23 +00007158 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
Thomas Wouters477c8d52006-05-27 19:21:47 +00007161 result = stringlib_find_slice(
7162 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7163 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7164 start, end
7165 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166
7167 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007168
Christian Heimes217cfd12007-12-02 14:31:20 +00007169 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170}
7171
7172static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007173unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174{
7175 if (index < 0 || index >= self->length) {
7176 PyErr_SetString(PyExc_IndexError, "string index out of range");
7177 return NULL;
7178 }
7179
7180 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7181}
7182
Guido van Rossumc2504932007-09-18 19:42:40 +00007183/* Believe it or not, this produces the same value for ASCII strings
7184 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007186unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187{
Guido van Rossumc2504932007-09-18 19:42:40 +00007188 Py_ssize_t len;
7189 Py_UNICODE *p;
7190 long x;
7191
7192 if (self->hash != -1)
7193 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007194 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007195 p = self->str;
7196 x = *p << 7;
7197 while (--len >= 0)
7198 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007199 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007200 if (x == -1)
7201 x = -2;
7202 self->hash = x;
7203 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204}
7205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007206PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007209Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210
7211static PyObject *
7212unicode_index(PyUnicodeObject *self, PyObject *args)
7213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007214 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007215 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007216 Py_ssize_t start;
7217 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218
Christian Heimes9cd17752007-11-18 19:35:23 +00007219 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221
Thomas Wouters477c8d52006-05-27 19:21:47 +00007222 result = stringlib_find_slice(
7223 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7224 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7225 start, end
7226 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
7228 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007229
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230 if (result < 0) {
7231 PyErr_SetString(PyExc_ValueError, "substring not found");
7232 return NULL;
7233 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007234
Christian Heimes217cfd12007-12-02 14:31:20 +00007235 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236}
7237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007238PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007241Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007242at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243
7244static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007245unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246{
7247 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7248 register const Py_UNICODE *e;
7249 int cased;
7250
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 /* Shortcut for single character strings */
7252 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007253 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007255 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007256 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007258
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 e = p + PyUnicode_GET_SIZE(self);
7260 cased = 0;
7261 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007263
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7265 return PyBool_FromLong(0);
7266 else if (!cased && Py_UNICODE_ISLOWER(ch))
7267 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007269 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270}
7271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007272PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007275Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007276at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277
7278static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007279unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280{
7281 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7282 register const Py_UNICODE *e;
7283 int cased;
7284
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 /* Shortcut for single character strings */
7286 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007289 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007290 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007292
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 e = p + PyUnicode_GET_SIZE(self);
7294 cased = 0;
7295 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007296 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007297
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7299 return PyBool_FromLong(0);
7300 else if (!cased && Py_UNICODE_ISUPPER(ch))
7301 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007303 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304}
7305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007306PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007309Return True if S is a titlecased string and there is at least one\n\
7310character in S, i.e. upper- and titlecase characters may only\n\
7311follow uncased characters and lowercase characters only cased ones.\n\
7312Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313
7314static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007315unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316{
7317 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7318 register const Py_UNICODE *e;
7319 int cased, previous_is_cased;
7320
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 /* Shortcut for single character strings */
7322 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7324 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007326 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007327 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007329
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 e = p + PyUnicode_GET_SIZE(self);
7331 cased = 0;
7332 previous_is_cased = 0;
7333 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007335
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7337 if (previous_is_cased)
7338 return PyBool_FromLong(0);
7339 previous_is_cased = 1;
7340 cased = 1;
7341 }
7342 else if (Py_UNICODE_ISLOWER(ch)) {
7343 if (!previous_is_cased)
7344 return PyBool_FromLong(0);
7345 previous_is_cased = 1;
7346 cased = 1;
7347 }
7348 else
7349 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007351 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352}
7353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007354PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007357Return True if all characters in S are whitespace\n\
7358and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359
7360static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007361unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362{
7363 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7364 register const Py_UNICODE *e;
7365
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366 /* Shortcut for single character strings */
7367 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 Py_UNICODE_ISSPACE(*p))
7369 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007371 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007372 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007374
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 e = p + PyUnicode_GET_SIZE(self);
7376 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 if (!Py_UNICODE_ISSPACE(*p))
7378 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007380 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381}
7382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007383PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007385\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007386Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007387and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007388
7389static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007390unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007391{
7392 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7393 register const Py_UNICODE *e;
7394
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007395 /* Shortcut for single character strings */
7396 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 Py_UNICODE_ISALPHA(*p))
7398 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007399
7400 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007401 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007403
7404 e = p + PyUnicode_GET_SIZE(self);
7405 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 if (!Py_UNICODE_ISALPHA(*p))
7407 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007408 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007409 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007410}
7411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007412PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007414\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007415Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007416and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007417
7418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007419unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007420{
7421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7422 register const Py_UNICODE *e;
7423
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007424 /* Shortcut for single character strings */
7425 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 Py_UNICODE_ISALNUM(*p))
7427 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007428
7429 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007430 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007432
7433 e = p + PyUnicode_GET_SIZE(self);
7434 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 if (!Py_UNICODE_ISALNUM(*p))
7436 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007437 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007438 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007439}
7440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007444Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007445False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007448unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
7450 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7451 register const Py_UNICODE *e;
7452
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 /* Shortcut for single character strings */
7454 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 Py_UNICODE_ISDECIMAL(*p))
7456 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007458 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007459 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007461
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 e = p + PyUnicode_GET_SIZE(self);
7463 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 if (!Py_UNICODE_ISDECIMAL(*p))
7465 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007467 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468}
7469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007470PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007473Return True if all characters in S are digits\n\
7474and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475
7476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007477unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478{
7479 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7480 register const Py_UNICODE *e;
7481
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 /* Shortcut for single character strings */
7483 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 Py_UNICODE_ISDIGIT(*p))
7485 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007487 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007488 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007490
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 e = p + PyUnicode_GET_SIZE(self);
7492 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 if (!Py_UNICODE_ISDIGIT(*p))
7494 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007496 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497}
7498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007499PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007502Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504
7505static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007506unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507{
7508 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7509 register const Py_UNICODE *e;
7510
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 /* Shortcut for single character strings */
7512 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 Py_UNICODE_ISNUMERIC(*p))
7514 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007516 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007517 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007519
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 e = p + PyUnicode_GET_SIZE(self);
7521 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 if (!Py_UNICODE_ISNUMERIC(*p))
7523 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007525 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526}
7527
Martin v. Löwis47383402007-08-15 07:32:56 +00007528int
7529PyUnicode_IsIdentifier(PyObject *self)
7530{
7531 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7532 register const Py_UNICODE *e;
7533
7534 /* Special case for empty strings */
7535 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007537
7538 /* PEP 3131 says that the first character must be in
7539 XID_Start and subsequent characters in XID_Continue,
7540 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007541 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007542 letters, digits, underscore). However, given the current
7543 definition of XID_Start and XID_Continue, it is sufficient
7544 to check just for these, except that _ must be allowed
7545 as starting an identifier. */
7546 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7547 return 0;
7548
7549 e = p + PyUnicode_GET_SIZE(self);
7550 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 if (!_PyUnicode_IsXidContinue(*p))
7552 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007553 }
7554 return 1;
7555}
7556
7557PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007559\n\
7560Return True if S is a valid identifier according\n\
7561to the language definition.");
7562
7563static PyObject*
7564unicode_isidentifier(PyObject *self)
7565{
7566 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7567}
7568
Georg Brandl559e5d72008-06-11 18:37:52 +00007569PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007571\n\
7572Return True if all characters in S are considered\n\
7573printable in repr() or S is empty, False otherwise.");
7574
7575static PyObject*
7576unicode_isprintable(PyObject *self)
7577{
7578 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7579 register const Py_UNICODE *e;
7580
7581 /* Shortcut for single character strings */
7582 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7583 Py_RETURN_TRUE;
7584 }
7585
7586 e = p + PyUnicode_GET_SIZE(self);
7587 for (; p < e; p++) {
7588 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7589 Py_RETURN_FALSE;
7590 }
7591 }
7592 Py_RETURN_TRUE;
7593}
7594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007595PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007596 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597\n\
7598Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007599iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600
7601static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007602unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007604 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605}
7606
Martin v. Löwis18e16552006-02-15 17:27:45 +00007607static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608unicode_length(PyUnicodeObject *self)
7609{
7610 return self->length;
7611}
7612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007613PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007616Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007617done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618
7619static PyObject *
7620unicode_ljust(PyUnicodeObject *self, PyObject *args)
7621{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007622 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007623 Py_UNICODE fillchar = ' ';
7624
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007625 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626 return NULL;
7627
Tim Peters7a29bd52001-09-12 03:03:31 +00007628 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 Py_INCREF(self);
7630 return (PyObject*) self;
7631 }
7632
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007633 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634}
7635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007636PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007639Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
7641static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007642unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 return fixup(self, fixlower);
7645}
7646
/* Strip modes; the values double as indexes into stripformat[] */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple format strings, indexed by the strip mode above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its 3-char "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7655
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007656/* externally visible for str.strip(unicode) */
7657PyObject *
7658_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7659{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007660 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7661 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7662 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7663 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7664 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007665
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007667
Benjamin Peterson14339b62009-01-31 16:36:08 +00007668 i = 0;
7669 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7671 i++;
7672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007673 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007674
Benjamin Peterson14339b62009-01-31 16:36:08 +00007675 j = len;
7676 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 do {
7678 j--;
7679 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7680 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007681 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007682
Benjamin Peterson14339b62009-01-31 16:36:08 +00007683 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 Py_INCREF(self);
7685 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007686 }
7687 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007689}
7690
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691
7692static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007693do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007695 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7696 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007697
Benjamin Peterson14339b62009-01-31 16:36:08 +00007698 i = 0;
7699 if (striptype != RIGHTSTRIP) {
7700 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7701 i++;
7702 }
7703 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007704
Benjamin Peterson14339b62009-01-31 16:36:08 +00007705 j = len;
7706 if (striptype != LEFTSTRIP) {
7707 do {
7708 j--;
7709 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7710 j++;
7711 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007712
Benjamin Peterson14339b62009-01-31 16:36:08 +00007713 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7714 Py_INCREF(self);
7715 return (PyObject*)self;
7716 }
7717 else
7718 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719}
7720
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007721
7722static PyObject *
7723do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7724{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007726
Benjamin Peterson14339b62009-01-31 16:36:08 +00007727 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7728 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007729
Benjamin Peterson14339b62009-01-31 16:36:08 +00007730 if (sep != NULL && sep != Py_None) {
7731 if (PyUnicode_Check(sep))
7732 return _PyUnicode_XStrip(self, striptype, sep);
7733 else {
7734 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 "%s arg must be None or str",
7736 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007737 return NULL;
7738 }
7739 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007740
Benjamin Peterson14339b62009-01-31 16:36:08 +00007741 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007742}
7743
7744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007745PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007747\n\
7748Return a copy of the string S with leading and trailing\n\
7749whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007750If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007751
7752static PyObject *
7753unicode_strip(PyUnicodeObject *self, PyObject *args)
7754{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007755 if (PyTuple_GET_SIZE(args) == 0)
7756 return do_strip(self, BOTHSTRIP); /* Common case */
7757 else
7758 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007759}
7760
7761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007762PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007764\n\
7765Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007766If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007767
7768static PyObject *
7769unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7770{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771 if (PyTuple_GET_SIZE(args) == 0)
7772 return do_strip(self, LEFTSTRIP); /* Common case */
7773 else
7774 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007775}
7776
7777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007778PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007780\n\
7781Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007782If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007783
7784static PyObject *
7785unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7786{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007787 if (PyTuple_GET_SIZE(args) == 0)
7788 return do_strip(self, RIGHTSTRIP); /* Common case */
7789 else
7790 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007791}
7792
7793
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007795unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796{
7797 PyUnicodeObject *u;
7798 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007799 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007800 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801
Georg Brandl222de0f2009-04-12 12:01:50 +00007802 if (len < 1) {
7803 Py_INCREF(unicode_empty);
7804 return (PyObject *)unicode_empty;
7805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806
Tim Peters7a29bd52001-09-12 03:03:31 +00007807 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 /* no repeat, return original string */
7809 Py_INCREF(str);
7810 return (PyObject*) str;
7811 }
Tim Peters8f422462000-09-09 06:13:41 +00007812
7813 /* ensure # of chars needed doesn't overflow int and # of bytes
7814 * needed doesn't overflow size_t
7815 */
7816 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007817 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007818 PyErr_SetString(PyExc_OverflowError,
7819 "repeated string is too long");
7820 return NULL;
7821 }
7822 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7823 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7824 PyErr_SetString(PyExc_OverflowError,
7825 "repeated string is too long");
7826 return NULL;
7827 }
7828 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829 if (!u)
7830 return NULL;
7831
7832 p = u->str;
7833
Georg Brandl222de0f2009-04-12 12:01:50 +00007834 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007835 Py_UNICODE_FILL(p, str->str[0], len);
7836 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007837 Py_ssize_t done = str->length; /* number of characters copied this far */
7838 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007840 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007841 Py_UNICODE_COPY(p+done, p, n);
7842 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 }
7845
7846 return (PyObject*) u;
7847}
7848
7849PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 PyObject *subobj,
7851 PyObject *replobj,
7852 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853{
7854 PyObject *self;
7855 PyObject *str1;
7856 PyObject *str2;
7857 PyObject *result;
7858
7859 self = PyUnicode_FromObject(obj);
7860 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862 str1 = PyUnicode_FromObject(subobj);
7863 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 Py_DECREF(self);
7865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 }
7867 str2 = PyUnicode_FromObject(replobj);
7868 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 Py_DECREF(self);
7870 Py_DECREF(str1);
7871 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 }
Tim Petersced69f82003-09-16 20:30:58 +00007873 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 (PyUnicodeObject *)str1,
7875 (PyUnicodeObject *)str2,
7876 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877 Py_DECREF(self);
7878 Py_DECREF(str1);
7879 Py_DECREF(str2);
7880 return result;
7881}
7882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007883PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885\n\
7886Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007887old replaced by new. If the optional argument count is\n\
7888given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889
7890static PyObject*
7891unicode_replace(PyUnicodeObject *self, PyObject *args)
7892{
7893 PyUnicodeObject *str1;
7894 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007895 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 PyObject *result;
7897
Martin v. Löwis18e16552006-02-15 17:27:45 +00007898 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899 return NULL;
7900 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7901 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007904 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 Py_DECREF(str1);
7906 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908
7909 result = replace(self, str1, str2, maxcount);
7910
7911 Py_DECREF(str1);
7912 Py_DECREF(str2);
7913 return result;
7914}
7915
7916static
7917PyObject *unicode_repr(PyObject *unicode)
7918{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007919 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007920 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007921 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7922 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7923
7924 /* XXX(nnorwitz): rather than over-allocating, it would be
7925 better to choose a different scheme. Perhaps scan the
7926 first N-chars of the string and allocate based on that size.
7927 */
7928 /* Initial allocation is based on the longest-possible unichr
7929 escape.
7930
7931 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7932 unichr, so in this case it's the longest unichr escape. In
7933 narrow (UTF-16) builds this is five chars per source unichr
7934 since there are two unichrs in the surrogate pair, so in narrow
7935 (UTF-16) builds it's not the longest unichr escape.
7936
7937 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7938 so in the narrow (UTF-16) build case it's the longest unichr
7939 escape.
7940 */
7941
Walter Dörwald1ab83302007-05-18 17:15:44 +00007942 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007944#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007946#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007948#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007950 if (repr == NULL)
7951 return NULL;
7952
Walter Dörwald1ab83302007-05-18 17:15:44 +00007953 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007954
7955 /* Add quote */
7956 *p++ = (findchar(s, size, '\'') &&
7957 !findchar(s, size, '"')) ? '"' : '\'';
7958 while (size-- > 0) {
7959 Py_UNICODE ch = *s++;
7960
7961 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007962 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007963 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007964 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007965 continue;
7966 }
7967
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007969 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007970 *p++ = '\\';
7971 *p++ = 't';
7972 }
7973 else if (ch == '\n') {
7974 *p++ = '\\';
7975 *p++ = 'n';
7976 }
7977 else if (ch == '\r') {
7978 *p++ = '\\';
7979 *p++ = 'r';
7980 }
7981
7982 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007983 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007984 *p++ = '\\';
7985 *p++ = 'x';
7986 *p++ = hexdigits[(ch >> 4) & 0x000F];
7987 *p++ = hexdigits[ch & 0x000F];
7988 }
7989
Georg Brandl559e5d72008-06-11 18:37:52 +00007990 /* Copy ASCII characters as-is */
7991 else if (ch < 0x7F) {
7992 *p++ = ch;
7993 }
7994
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007996 else {
7997 Py_UCS4 ucs = ch;
7998
7999#ifndef Py_UNICODE_WIDE
8000 Py_UNICODE ch2 = 0;
8001 /* Get code point from surrogate pair */
8002 if (size > 0) {
8003 ch2 = *s;
8004 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008006 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008008 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008009 size--;
8010 }
8011 }
8012#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008013 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008014 (categories Z* and C* except ASCII space)
8015 */
8016 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8017 /* Map 8-bit characters to '\xhh' */
8018 if (ucs <= 0xff) {
8019 *p++ = '\\';
8020 *p++ = 'x';
8021 *p++ = hexdigits[(ch >> 4) & 0x000F];
8022 *p++ = hexdigits[ch & 0x000F];
8023 }
8024 /* Map 21-bit characters to '\U00xxxxxx' */
8025 else if (ucs >= 0x10000) {
8026 *p++ = '\\';
8027 *p++ = 'U';
8028 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8029 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8030 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8031 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8032 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8033 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8034 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8035 *p++ = hexdigits[ucs & 0x0000000F];
8036 }
8037 /* Map 16-bit characters to '\uxxxx' */
8038 else {
8039 *p++ = '\\';
8040 *p++ = 'u';
8041 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8042 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8043 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8044 *p++ = hexdigits[ucs & 0x000F];
8045 }
8046 }
8047 /* Copy characters as-is */
8048 else {
8049 *p++ = ch;
8050#ifndef Py_UNICODE_WIDE
8051 if (ucs >= 0x10000)
8052 *p++ = ch2;
8053#endif
8054 }
8055 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008056 }
8057 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008058 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008059
8060 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008061 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008062 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063}
8064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008065PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067\n\
8068Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008069such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070arguments start and end are interpreted as in slice notation.\n\
8071\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008072Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073
8074static PyObject *
8075unicode_rfind(PyUnicodeObject *self, PyObject *args)
8076{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008077 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008078 Py_ssize_t start;
8079 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008080 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081
Christian Heimes9cd17752007-11-18 19:35:23 +00008082 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084
Thomas Wouters477c8d52006-05-27 19:21:47 +00008085 result = stringlib_rfind_slice(
8086 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8087 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8088 start, end
8089 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090
8091 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008092
Christian Heimes217cfd12007-12-02 14:31:20 +00008093 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094}
8095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008096PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008099Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100
8101static PyObject *
8102unicode_rindex(PyUnicodeObject *self, PyObject *args)
8103{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008104 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008105 Py_ssize_t start;
8106 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008107 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108
Christian Heimes9cd17752007-11-18 19:35:23 +00008109 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008110 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111
Thomas Wouters477c8d52006-05-27 19:21:47 +00008112 result = stringlib_rfind_slice(
8113 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8114 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8115 start, end
8116 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117
8118 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008119
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 if (result < 0) {
8121 PyErr_SetString(PyExc_ValueError, "substring not found");
8122 return NULL;
8123 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008124 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125}
8126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008127PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008130Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008131done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132
8133static PyObject *
8134unicode_rjust(PyUnicodeObject *self, PyObject *args)
8135{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008136 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008137 Py_UNICODE fillchar = ' ';
8138
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008139 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 return NULL;
8141
Tim Peters7a29bd52001-09-12 03:03:31 +00008142 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 Py_INCREF(self);
8144 return (PyObject*) self;
8145 }
8146
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008147 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148}
8149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 PyObject *sep,
8152 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153{
8154 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008155
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 s = PyUnicode_FromObject(s);
8157 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 if (sep != NULL) {
8160 sep = PyUnicode_FromObject(sep);
8161 if (sep == NULL) {
8162 Py_DECREF(s);
8163 return NULL;
8164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 }
8166
8167 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8168
8169 Py_DECREF(s);
8170 Py_XDECREF(sep);
8171 return result;
8172}
8173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008174PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176\n\
8177Return a list of the words in S, using sep as the\n\
8178delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008179splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008180whitespace string is a separator and empty strings are\n\
8181removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182
8183static PyObject*
8184unicode_split(PyUnicodeObject *self, PyObject *args)
8185{
8186 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008187 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188
Martin v. Löwis18e16552006-02-15 17:27:45 +00008189 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 return NULL;
8191
8192 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198}
8199
Thomas Wouters477c8d52006-05-27 19:21:47 +00008200PyObject *
8201PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8202{
8203 PyObject* str_obj;
8204 PyObject* sep_obj;
8205 PyObject* out;
8206
8207 str_obj = PyUnicode_FromObject(str_in);
8208 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008210 sep_obj = PyUnicode_FromObject(sep_in);
8211 if (!sep_obj) {
8212 Py_DECREF(str_obj);
8213 return NULL;
8214 }
8215
8216 out = stringlib_partition(
8217 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8218 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8219 );
8220
8221 Py_DECREF(sep_obj);
8222 Py_DECREF(str_obj);
8223
8224 return out;
8225}
8226
8227
8228PyObject *
8229PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8230{
8231 PyObject* str_obj;
8232 PyObject* sep_obj;
8233 PyObject* out;
8234
8235 str_obj = PyUnicode_FromObject(str_in);
8236 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008238 sep_obj = PyUnicode_FromObject(sep_in);
8239 if (!sep_obj) {
8240 Py_DECREF(str_obj);
8241 return NULL;
8242 }
8243
8244 out = stringlib_rpartition(
8245 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8246 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8247 );
8248
8249 Py_DECREF(sep_obj);
8250 Py_DECREF(str_obj);
8251
8252 return out;
8253}
8254
8255PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008257\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008258Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008259the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008260found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008261
8262static PyObject*
8263unicode_partition(PyUnicodeObject *self, PyObject *separator)
8264{
8265 return PyUnicode_Partition((PyObject *)self, separator);
8266}
8267
8268PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008269 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008270\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008271Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008272the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008273separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008274
8275static PyObject*
8276unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8277{
8278 return PyUnicode_RPartition((PyObject *)self, separator);
8279}
8280
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008281PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 PyObject *sep,
8283 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008284{
8285 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008286
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008287 s = PyUnicode_FromObject(s);
8288 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008289 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 if (sep != NULL) {
8291 sep = PyUnicode_FromObject(sep);
8292 if (sep == NULL) {
8293 Py_DECREF(s);
8294 return NULL;
8295 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008296 }
8297
8298 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8299
8300 Py_DECREF(s);
8301 Py_XDECREF(sep);
8302 return result;
8303}
8304
8305PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008307\n\
8308Return a list of the words in S, using sep as the\n\
8309delimiter string, starting at the end of the string and\n\
8310working to the front. If maxsplit is given, at most maxsplit\n\
8311splits are done. If sep is not specified, any whitespace string\n\
8312is a separator.");
8313
8314static PyObject*
8315unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8316{
8317 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008318 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008319
Martin v. Löwis18e16552006-02-15 17:27:45 +00008320 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008321 return NULL;
8322
8323 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008325 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008327 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008329}
8330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008331PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333\n\
8334Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008335Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008336is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337
8338static PyObject*
8339unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8340{
Guido van Rossum86662912000-04-11 15:38:46 +00008341 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342
Guido van Rossum86662912000-04-11 15:38:46 +00008343 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 return NULL;
8345
Guido van Rossum86662912000-04-11 15:38:46 +00008346 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347}
8348
8349static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008350PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351{
Walter Dörwald346737f2007-05-31 10:44:43 +00008352 if (PyUnicode_CheckExact(self)) {
8353 Py_INCREF(self);
8354 return self;
8355 } else
8356 /* Subtype -- return genuine unicode string with the same value. */
8357 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8358 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359}
8360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008361PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363\n\
8364Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008365and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366
8367static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008368unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 return fixup(self, fixswapcase);
8371}
8372
Georg Brandlceee0772007-11-27 23:48:05 +00008373PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008375\n\
8376Return a translation table usable for str.translate().\n\
8377If there is only one argument, it must be a dictionary mapping Unicode\n\
8378ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008379Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008380If there are two arguments, they must be strings of equal length, and\n\
8381in the resulting dictionary, each character in x will be mapped to the\n\
8382character at the same position in y. If there is a third argument, it\n\
8383must be a string, whose characters will be mapped to None in the result.");
8384
8385static PyObject*
8386unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8387{
8388 PyObject *x, *y = NULL, *z = NULL;
8389 PyObject *new = NULL, *key, *value;
8390 Py_ssize_t i = 0;
8391 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008392
Georg Brandlceee0772007-11-27 23:48:05 +00008393 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8394 return NULL;
8395 new = PyDict_New();
8396 if (!new)
8397 return NULL;
8398 if (y != NULL) {
8399 /* x must be a string too, of equal length */
8400 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8401 if (!PyUnicode_Check(x)) {
8402 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8403 "be a string if there is a second argument");
8404 goto err;
8405 }
8406 if (PyUnicode_GET_SIZE(x) != ylen) {
8407 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8408 "arguments must have equal length");
8409 goto err;
8410 }
8411 /* create entries for translating chars in x to those in y */
8412 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008413 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8414 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008415 if (!key || !value)
8416 goto err;
8417 res = PyDict_SetItem(new, key, value);
8418 Py_DECREF(key);
8419 Py_DECREF(value);
8420 if (res < 0)
8421 goto err;
8422 }
8423 /* create entries for deleting chars in z */
8424 if (z != NULL) {
8425 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008426 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008427 if (!key)
8428 goto err;
8429 res = PyDict_SetItem(new, key, Py_None);
8430 Py_DECREF(key);
8431 if (res < 0)
8432 goto err;
8433 }
8434 }
8435 } else {
8436 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008437 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008438 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8439 "to maketrans it must be a dict");
8440 goto err;
8441 }
8442 /* copy entries into the new dict, converting string keys to int keys */
8443 while (PyDict_Next(x, &i, &key, &value)) {
8444 if (PyUnicode_Check(key)) {
8445 /* convert string keys to integer keys */
8446 PyObject *newkey;
8447 if (PyUnicode_GET_SIZE(key) != 1) {
8448 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8449 "table must be of length 1");
8450 goto err;
8451 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008452 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008453 if (!newkey)
8454 goto err;
8455 res = PyDict_SetItem(new, newkey, value);
8456 Py_DECREF(newkey);
8457 if (res < 0)
8458 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008459 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008460 /* just keep integer keys */
8461 if (PyDict_SetItem(new, key, value) < 0)
8462 goto err;
8463 } else {
8464 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8465 "be strings or integers");
8466 goto err;
8467 }
8468 }
8469 }
8470 return new;
8471 err:
8472 Py_DECREF(new);
8473 return NULL;
8474}
8475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008476PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478\n\
8479Return a copy of the string S, where all characters have been mapped\n\
8480through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008481Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008482Unmapped characters are left untouched. Characters mapped to None\n\
8483are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484
8485static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008486unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487{
Georg Brandlceee0772007-11-27 23:48:05 +00008488 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489}
8490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008491PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008494Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495
8496static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008497unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 return fixup(self, fixupper);
8500}
8501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008502PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008505Pad a numeric string S with zeros on the left, to fill a field\n\
8506of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507
8508static PyObject *
8509unicode_zfill(PyUnicodeObject *self, PyObject *args)
8510{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008511 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 PyUnicodeObject *u;
8513
Martin v. Löwis18e16552006-02-15 17:27:45 +00008514 Py_ssize_t width;
8515 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 return NULL;
8517
8518 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008519 if (PyUnicode_CheckExact(self)) {
8520 Py_INCREF(self);
8521 return (PyObject*) self;
8522 }
8523 else
8524 return PyUnicode_FromUnicode(
8525 PyUnicode_AS_UNICODE(self),
8526 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 }
8529
8530 fill = width - self->length;
8531
8532 u = pad(self, fill, 0, '0');
8533
Walter Dörwald068325e2002-04-15 13:36:47 +00008534 if (u == NULL)
8535 return NULL;
8536
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 if (u->str[fill] == '+' || u->str[fill] == '-') {
8538 /* move sign to beginning of string */
8539 u->str[0] = u->str[fill];
8540 u->str[fill] = '0';
8541 }
8542
8543 return (PyObject*) u;
8544}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
#if 0
/* Compiled out: reports the current value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008554PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008557Return True if S starts with the specified prefix, False otherwise.\n\
8558With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008559With optional end, stop comparing S at that position.\n\
8560prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561
8562static PyObject *
8563unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008566 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008568 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008569 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008570 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008572 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8574 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008575 if (PyTuple_Check(subobj)) {
8576 Py_ssize_t i;
8577 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8578 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008580 if (substring == NULL)
8581 return NULL;
8582 result = tailmatch(self, substring, start, end, -1);
8583 Py_DECREF(substring);
8584 if (result) {
8585 Py_RETURN_TRUE;
8586 }
8587 }
8588 /* nothing matched */
8589 Py_RETURN_FALSE;
8590 }
8591 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008594 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008596 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597}
8598
8599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008600PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008603Return True if S ends with the specified suffix, False otherwise.\n\
8604With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008605With optional end, stop comparing S at that position.\n\
8606suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607
8608static PyObject *
8609unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008612 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008614 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008615 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008616 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008618 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8620 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008621 if (PyTuple_Check(subobj)) {
8622 Py_ssize_t i;
8623 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8624 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008626 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008628 result = tailmatch(self, substring, start, end, +1);
8629 Py_DECREF(substring);
8630 if (result) {
8631 Py_RETURN_TRUE;
8632 }
8633 }
8634 Py_RETURN_FALSE;
8635 }
8636 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008640 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008642 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643}
8644
Eric Smith8c663262007-08-25 02:26:07 +00008645#include "stringlib/string_format.h"
8646
8647PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008649\n\
8650");
8651
Eric Smith4a7d76d2008-05-30 18:10:19 +00008652static PyObject *
8653unicode__format__(PyObject* self, PyObject* args)
8654{
8655 PyObject *format_spec;
8656
8657 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8658 return NULL;
8659
8660 return _PyUnicode_FormatAdvanced(self,
8661 PyUnicode_AS_UNICODE(format_spec),
8662 PyUnicode_GET_SIZE(format_spec));
8663}
8664
Eric Smith8c663262007-08-25 02:26:07 +00008665PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008667\n\
8668");
8669
8670static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008671unicode__sizeof__(PyUnicodeObject *v)
8672{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008673 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8674 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008675}
8676
8677PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008679
8680static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008681unicode_getnewargs(PyUnicodeObject *v)
8682{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008683 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008684}
8685
8686
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687static PyMethodDef unicode_methods[] = {
8688
8689 /* Order is according to common usage: often used methods should
8690 appear first, since lookup is done sequentially. */
8691
Benjamin Peterson308d6372009-09-18 21:42:35 +00008692 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008693 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8694 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008695 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008696 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8697 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8698 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8699 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8700 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8701 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8702 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008703 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008704 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8705 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8706 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008707 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008708 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8709 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8710 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008711 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008712 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008713 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008714 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008715 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8716 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8717 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8718 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8719 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8720 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8721 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8722 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8723 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8724 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8725 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8726 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8727 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8728 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008729 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008730 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008731 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008732 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008733 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008734 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8735 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008736 {"maketrans", (PyCFunction) unicode_maketrans,
8737 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008738 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008739#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008740 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741#endif
8742
8743#if 0
8744 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008745 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746#endif
8747
Benjamin Peterson14339b62009-01-31 16:36:08 +00008748 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 {NULL, NULL}
8750};
8751
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008752static PyObject *
8753unicode_mod(PyObject *v, PyObject *w)
8754{
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 if (!PyUnicode_Check(v)) {
8756 Py_INCREF(Py_NotImplemented);
8757 return Py_NotImplemented;
8758 }
8759 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008760}
8761
8762static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008763 0, /*nb_add*/
8764 0, /*nb_subtract*/
8765 0, /*nb_multiply*/
8766 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008767};
8768
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008770 (lenfunc) unicode_length, /* sq_length */
8771 PyUnicode_Concat, /* sq_concat */
8772 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8773 (ssizeargfunc) unicode_getitem, /* sq_item */
8774 0, /* sq_slice */
8775 0, /* sq_ass_item */
8776 0, /* sq_ass_slice */
8777 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778};
8779
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008780static PyObject*
8781unicode_subscript(PyUnicodeObject* self, PyObject* item)
8782{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008783 if (PyIndex_Check(item)) {
8784 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008785 if (i == -1 && PyErr_Occurred())
8786 return NULL;
8787 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008788 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008789 return unicode_getitem(self, i);
8790 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008791 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008792 Py_UNICODE* source_buf;
8793 Py_UNICODE* result_buf;
8794 PyObject* result;
8795
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008796 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008798 return NULL;
8799 }
8800
8801 if (slicelength <= 0) {
8802 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008803 } else if (start == 0 && step == 1 && slicelength == self->length &&
8804 PyUnicode_CheckExact(self)) {
8805 Py_INCREF(self);
8806 return (PyObject *)self;
8807 } else if (step == 1) {
8808 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008809 } else {
8810 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008811 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8812 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008813
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 if (result_buf == NULL)
8815 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008816
8817 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8818 result_buf[i] = source_buf[cur];
8819 }
Tim Petersced69f82003-09-16 20:30:58 +00008820
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008821 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008822 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008823 return result;
8824 }
8825 } else {
8826 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8827 return NULL;
8828 }
8829}
8830
8831static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008832 (lenfunc)unicode_length, /* mp_length */
8833 (binaryfunc)unicode_subscript, /* mp_subscript */
8834 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008835};
8836
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838/* Helpers for PyUnicode_Format() */
8839
8840static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008841getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008843 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 (*p_argidx)++;
8846 if (arglen < 0)
8847 return args;
8848 else
8849 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850 }
8851 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 return NULL;
8854}
8855
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008856/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008858static PyObject *
8859formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008861 char *p;
8862 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008864
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 x = PyFloat_AsDouble(v);
8866 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008867 return NULL;
8868
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008871
Eric Smith0923d1d2009-04-16 20:16:10 +00008872 p = PyOS_double_to_string(x, type, prec,
8873 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008874 if (p == NULL)
8875 return NULL;
8876 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008877 PyMem_Free(p);
8878 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879}
8880
Tim Peters38fd5b62000-09-21 05:43:11 +00008881static PyObject*
8882formatlong(PyObject *val, int flags, int prec, int type)
8883{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008884 char *buf;
8885 int len;
8886 PyObject *str; /* temporary string object. */
8887 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008888
Benjamin Peterson14339b62009-01-31 16:36:08 +00008889 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8890 if (!str)
8891 return NULL;
8892 result = PyUnicode_FromStringAndSize(buf, len);
8893 Py_DECREF(str);
8894 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008895}
8896
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897static int
8898formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008899 size_t buflen,
8900 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008902 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008903 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 if (PyUnicode_GET_SIZE(v) == 1) {
8905 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8906 buf[1] = '\0';
8907 return 1;
8908 }
8909#ifndef Py_UNICODE_WIDE
8910 if (PyUnicode_GET_SIZE(v) == 2) {
8911 /* Decode a valid surrogate pair */
8912 int c0 = PyUnicode_AS_UNICODE(v)[0];
8913 int c1 = PyUnicode_AS_UNICODE(v)[1];
8914 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8915 0xDC00 <= c1 && c1 <= 0xDFFF) {
8916 buf[0] = c0;
8917 buf[1] = c1;
8918 buf[2] = '\0';
8919 return 2;
8920 }
8921 }
8922#endif
8923 goto onError;
8924 }
8925 else {
8926 /* Integer input truncated to a character */
8927 long x;
8928 x = PyLong_AsLong(v);
8929 if (x == -1 && PyErr_Occurred())
8930 goto onError;
8931
8932 if (x < 0 || x > 0x10ffff) {
8933 PyErr_SetString(PyExc_OverflowError,
8934 "%c arg not in range(0x110000)");
8935 return -1;
8936 }
8937
8938#ifndef Py_UNICODE_WIDE
8939 if (x > 0xffff) {
8940 x -= 0x10000;
8941 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8942 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8943 return 2;
8944 }
8945#endif
8946 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008947 buf[1] = '\0';
8948 return 1;
8949 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008950
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008952 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008954 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955}
8956
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008957/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008958 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008959*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008960#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008961
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964{
8965 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008966 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 int args_owned = 0;
8968 PyUnicodeObject *result = NULL;
8969 PyObject *dict = NULL;
8970 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008971
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 PyErr_BadInternalCall();
8974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 }
8976 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008977 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 fmt = PyUnicode_AS_UNICODE(uformat);
8980 fmtcnt = PyUnicode_GET_SIZE(uformat);
8981
8982 reslen = rescnt = fmtcnt + 100;
8983 result = _PyUnicode_New(reslen);
8984 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 res = PyUnicode_AS_UNICODE(result);
8987
8988 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 arglen = PyTuple_Size(args);
8990 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991 }
8992 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 arglen = -1;
8994 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008996 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008997 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999
9000 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 if (*fmt != '%') {
9002 if (--rescnt < 0) {
9003 rescnt = fmtcnt + 100;
9004 reslen += rescnt;
9005 if (_PyUnicode_Resize(&result, reslen) < 0)
9006 goto onError;
9007 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9008 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009009 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009011 }
9012 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 /* Got a format specifier */
9014 int flags = 0;
9015 Py_ssize_t width = -1;
9016 int prec = -1;
9017 Py_UNICODE c = '\0';
9018 Py_UNICODE fill;
9019 int isnumok;
9020 PyObject *v = NULL;
9021 PyObject *temp = NULL;
9022 Py_UNICODE *pbuf;
9023 Py_UNICODE sign;
9024 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009025 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 fmt++;
9028 if (*fmt == '(') {
9029 Py_UNICODE *keystart;
9030 Py_ssize_t keylen;
9031 PyObject *key;
9032 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009033
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 if (dict == NULL) {
9035 PyErr_SetString(PyExc_TypeError,
9036 "format requires a mapping");
9037 goto onError;
9038 }
9039 ++fmt;
9040 --fmtcnt;
9041 keystart = fmt;
9042 /* Skip over balanced parentheses */
9043 while (pcount > 0 && --fmtcnt >= 0) {
9044 if (*fmt == ')')
9045 --pcount;
9046 else if (*fmt == '(')
9047 ++pcount;
9048 fmt++;
9049 }
9050 keylen = fmt - keystart - 1;
9051 if (fmtcnt < 0 || pcount > 0) {
9052 PyErr_SetString(PyExc_ValueError,
9053 "incomplete format key");
9054 goto onError;
9055 }
9056#if 0
9057 /* keys are converted to strings using UTF-8 and
9058 then looked up since Python uses strings to hold
9059 variables names etc. in its namespaces and we
9060 wouldn't want to break common idioms. */
9061 key = PyUnicode_EncodeUTF8(keystart,
9062 keylen,
9063 NULL);
9064#else
9065 key = PyUnicode_FromUnicode(keystart, keylen);
9066#endif
9067 if (key == NULL)
9068 goto onError;
9069 if (args_owned) {
9070 Py_DECREF(args);
9071 args_owned = 0;
9072 }
9073 args = PyObject_GetItem(dict, key);
9074 Py_DECREF(key);
9075 if (args == NULL) {
9076 goto onError;
9077 }
9078 args_owned = 1;
9079 arglen = -1;
9080 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009081 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 while (--fmtcnt >= 0) {
9083 switch (c = *fmt++) {
9084 case '-': flags |= F_LJUST; continue;
9085 case '+': flags |= F_SIGN; continue;
9086 case ' ': flags |= F_BLANK; continue;
9087 case '#': flags |= F_ALT; continue;
9088 case '0': flags |= F_ZERO; continue;
9089 }
9090 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 if (c == '*') {
9093 v = getnextarg(args, arglen, &argidx);
9094 if (v == NULL)
9095 goto onError;
9096 if (!PyLong_Check(v)) {
9097 PyErr_SetString(PyExc_TypeError,
9098 "* wants int");
9099 goto onError;
9100 }
9101 width = PyLong_AsLong(v);
9102 if (width == -1 && PyErr_Occurred())
9103 goto onError;
9104 if (width < 0) {
9105 flags |= F_LJUST;
9106 width = -width;
9107 }
9108 if (--fmtcnt >= 0)
9109 c = *fmt++;
9110 }
9111 else if (c >= '0' && c <= '9') {
9112 width = c - '0';
9113 while (--fmtcnt >= 0) {
9114 c = *fmt++;
9115 if (c < '0' || c > '9')
9116 break;
9117 if ((width*10) / 10 != width) {
9118 PyErr_SetString(PyExc_ValueError,
9119 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009120 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 }
9122 width = width*10 + (c - '0');
9123 }
9124 }
9125 if (c == '.') {
9126 prec = 0;
9127 if (--fmtcnt >= 0)
9128 c = *fmt++;
9129 if (c == '*') {
9130 v = getnextarg(args, arglen, &argidx);
9131 if (v == NULL)
9132 goto onError;
9133 if (!PyLong_Check(v)) {
9134 PyErr_SetString(PyExc_TypeError,
9135 "* wants int");
9136 goto onError;
9137 }
9138 prec = PyLong_AsLong(v);
9139 if (prec == -1 && PyErr_Occurred())
9140 goto onError;
9141 if (prec < 0)
9142 prec = 0;
9143 if (--fmtcnt >= 0)
9144 c = *fmt++;
9145 }
9146 else if (c >= '0' && c <= '9') {
9147 prec = c - '0';
9148 while (--fmtcnt >= 0) {
9149 c = Py_CHARMASK(*fmt++);
9150 if (c < '0' || c > '9')
9151 break;
9152 if ((prec*10) / 10 != prec) {
9153 PyErr_SetString(PyExc_ValueError,
9154 "prec too big");
9155 goto onError;
9156 }
9157 prec = prec*10 + (c - '0');
9158 }
9159 }
9160 } /* prec */
9161 if (fmtcnt >= 0) {
9162 if (c == 'h' || c == 'l' || c == 'L') {
9163 if (--fmtcnt >= 0)
9164 c = *fmt++;
9165 }
9166 }
9167 if (fmtcnt < 0) {
9168 PyErr_SetString(PyExc_ValueError,
9169 "incomplete format");
9170 goto onError;
9171 }
9172 if (c != '%') {
9173 v = getnextarg(args, arglen, &argidx);
9174 if (v == NULL)
9175 goto onError;
9176 }
9177 sign = 0;
9178 fill = ' ';
9179 switch (c) {
9180
9181 case '%':
9182 pbuf = formatbuf;
9183 /* presume that buffer length is at least 1 */
9184 pbuf[0] = '%';
9185 len = 1;
9186 break;
9187
9188 case 's':
9189 case 'r':
9190 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009191 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 temp = v;
9193 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 }
9195 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 if (c == 's')
9197 temp = PyObject_Str(v);
9198 else if (c == 'r')
9199 temp = PyObject_Repr(v);
9200 else
9201 temp = PyObject_ASCII(v);
9202 if (temp == NULL)
9203 goto onError;
9204 if (PyUnicode_Check(temp))
9205 /* nothing to do */;
9206 else {
9207 Py_DECREF(temp);
9208 PyErr_SetString(PyExc_TypeError,
9209 "%s argument has non-string str()");
9210 goto onError;
9211 }
9212 }
9213 pbuf = PyUnicode_AS_UNICODE(temp);
9214 len = PyUnicode_GET_SIZE(temp);
9215 if (prec >= 0 && len > prec)
9216 len = prec;
9217 break;
9218
9219 case 'i':
9220 case 'd':
9221 case 'u':
9222 case 'o':
9223 case 'x':
9224 case 'X':
9225 if (c == 'i')
9226 c = 'd';
9227 isnumok = 0;
9228 if (PyNumber_Check(v)) {
9229 PyObject *iobj=NULL;
9230
9231 if (PyLong_Check(v)) {
9232 iobj = v;
9233 Py_INCREF(iobj);
9234 }
9235 else {
9236 iobj = PyNumber_Long(v);
9237 }
9238 if (iobj!=NULL) {
9239 if (PyLong_Check(iobj)) {
9240 isnumok = 1;
9241 temp = formatlong(iobj, flags, prec, c);
9242 Py_DECREF(iobj);
9243 if (!temp)
9244 goto onError;
9245 pbuf = PyUnicode_AS_UNICODE(temp);
9246 len = PyUnicode_GET_SIZE(temp);
9247 sign = 1;
9248 }
9249 else {
9250 Py_DECREF(iobj);
9251 }
9252 }
9253 }
9254 if (!isnumok) {
9255 PyErr_Format(PyExc_TypeError,
9256 "%%%c format: a number is required, "
9257 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9258 goto onError;
9259 }
9260 if (flags & F_ZERO)
9261 fill = '0';
9262 break;
9263
9264 case 'e':
9265 case 'E':
9266 case 'f':
9267 case 'F':
9268 case 'g':
9269 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009270 temp = formatfloat(v, flags, prec, c);
9271 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009272 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009273 pbuf = PyUnicode_AS_UNICODE(temp);
9274 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009275 sign = 1;
9276 if (flags & F_ZERO)
9277 fill = '0';
9278 break;
9279
9280 case 'c':
9281 pbuf = formatbuf;
9282 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9283 if (len < 0)
9284 goto onError;
9285 break;
9286
9287 default:
9288 PyErr_Format(PyExc_ValueError,
9289 "unsupported format character '%c' (0x%x) "
9290 "at index %zd",
9291 (31<=c && c<=126) ? (char)c : '?',
9292 (int)c,
9293 (Py_ssize_t)(fmt - 1 -
9294 PyUnicode_AS_UNICODE(uformat)));
9295 goto onError;
9296 }
9297 if (sign) {
9298 if (*pbuf == '-' || *pbuf == '+') {
9299 sign = *pbuf++;
9300 len--;
9301 }
9302 else if (flags & F_SIGN)
9303 sign = '+';
9304 else if (flags & F_BLANK)
9305 sign = ' ';
9306 else
9307 sign = 0;
9308 }
9309 if (width < len)
9310 width = len;
9311 if (rescnt - (sign != 0) < width) {
9312 reslen -= rescnt;
9313 rescnt = width + fmtcnt + 100;
9314 reslen += rescnt;
9315 if (reslen < 0) {
9316 Py_XDECREF(temp);
9317 PyErr_NoMemory();
9318 goto onError;
9319 }
9320 if (_PyUnicode_Resize(&result, reslen) < 0) {
9321 Py_XDECREF(temp);
9322 goto onError;
9323 }
9324 res = PyUnicode_AS_UNICODE(result)
9325 + reslen - rescnt;
9326 }
9327 if (sign) {
9328 if (fill != ' ')
9329 *res++ = sign;
9330 rescnt--;
9331 if (width > len)
9332 width--;
9333 }
9334 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9335 assert(pbuf[0] == '0');
9336 assert(pbuf[1] == c);
9337 if (fill != ' ') {
9338 *res++ = *pbuf++;
9339 *res++ = *pbuf++;
9340 }
9341 rescnt -= 2;
9342 width -= 2;
9343 if (width < 0)
9344 width = 0;
9345 len -= 2;
9346 }
9347 if (width > len && !(flags & F_LJUST)) {
9348 do {
9349 --rescnt;
9350 *res++ = fill;
9351 } while (--width > len);
9352 }
9353 if (fill == ' ') {
9354 if (sign)
9355 *res++ = sign;
9356 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9357 assert(pbuf[0] == '0');
9358 assert(pbuf[1] == c);
9359 *res++ = *pbuf++;
9360 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009361 }
9362 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009363 Py_UNICODE_COPY(res, pbuf, len);
9364 res += len;
9365 rescnt -= len;
9366 while (--width >= len) {
9367 --rescnt;
9368 *res++ = ' ';
9369 }
9370 if (dict && (argidx < arglen) && c != '%') {
9371 PyErr_SetString(PyExc_TypeError,
9372 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009373 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009374 goto onError;
9375 }
9376 Py_XDECREF(temp);
9377 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 } /* until end */
9379 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 PyErr_SetString(PyExc_TypeError,
9381 "not all arguments converted during string formatting");
9382 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383 }
9384
Thomas Woutersa96affe2006-03-12 00:29:36 +00009385 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009386 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 }
9390 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391 return (PyObject *)result;
9392
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394 Py_XDECREF(result);
9395 Py_DECREF(uformat);
9396 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398 }
9399 return NULL;
9400}
9401
Jeremy Hylton938ace62002-07-17 16:30:39 +00009402static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009403unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9404
Tim Peters6d6c1a32001-08-02 04:15:00 +00009405static PyObject *
9406unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9407{
Benjamin Peterson29060642009-01-31 22:14:21 +00009408 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009409 static char *kwlist[] = {"object", "encoding", "errors", 0};
9410 char *encoding = NULL;
9411 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009412
Benjamin Peterson14339b62009-01-31 16:36:08 +00009413 if (type != &PyUnicode_Type)
9414 return unicode_subtype_new(type, args, kwds);
9415 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009417 return NULL;
9418 if (x == NULL)
9419 return (PyObject *)_PyUnicode_New(0);
9420 if (encoding == NULL && errors == NULL)
9421 return PyObject_Str(x);
9422 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009423 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009424}
9425
Guido van Rossume023fe02001-08-30 03:12:59 +00009426static PyObject *
9427unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9428{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009429 PyUnicodeObject *tmp, *pnew;
9430 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009431
Benjamin Peterson14339b62009-01-31 16:36:08 +00009432 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9433 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9434 if (tmp == NULL)
9435 return NULL;
9436 assert(PyUnicode_Check(tmp));
9437 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9438 if (pnew == NULL) {
9439 Py_DECREF(tmp);
9440 return NULL;
9441 }
9442 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9443 if (pnew->str == NULL) {
9444 _Py_ForgetReference((PyObject *)pnew);
9445 PyObject_Del(pnew);
9446 Py_DECREF(tmp);
9447 return PyErr_NoMemory();
9448 }
9449 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9450 pnew->length = n;
9451 pnew->hash = tmp->hash;
9452 Py_DECREF(tmp);
9453 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009454}
9455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009456PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009458\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009459Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009460encoding defaults to the current default string encoding.\n\
9461errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009462
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009463static PyObject *unicode_iter(PyObject *seq);
9464
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009466 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009467 "str", /* tp_name */
9468 sizeof(PyUnicodeObject), /* tp_size */
9469 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009471 (destructor)unicode_dealloc, /* tp_dealloc */
9472 0, /* tp_print */
9473 0, /* tp_getattr */
9474 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009475 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009476 unicode_repr, /* tp_repr */
9477 &unicode_as_number, /* tp_as_number */
9478 &unicode_as_sequence, /* tp_as_sequence */
9479 &unicode_as_mapping, /* tp_as_mapping */
9480 (hashfunc) unicode_hash, /* tp_hash*/
9481 0, /* tp_call*/
9482 (reprfunc) unicode_str, /* tp_str */
9483 PyObject_GenericGetAttr, /* tp_getattro */
9484 0, /* tp_setattro */
9485 0, /* tp_as_buffer */
9486 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009488 unicode_doc, /* tp_doc */
9489 0, /* tp_traverse */
9490 0, /* tp_clear */
9491 PyUnicode_RichCompare, /* tp_richcompare */
9492 0, /* tp_weaklistoffset */
9493 unicode_iter, /* tp_iter */
9494 0, /* tp_iternext */
9495 unicode_methods, /* tp_methods */
9496 0, /* tp_members */
9497 0, /* tp_getset */
9498 &PyBaseObject_Type, /* tp_base */
9499 0, /* tp_dict */
9500 0, /* tp_descr_get */
9501 0, /* tp_descr_set */
9502 0, /* tp_dictoffset */
9503 0, /* tp_init */
9504 0, /* tp_alloc */
9505 unicode_new, /* tp_new */
9506 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507};
9508
9509/* Initialize the Unicode implementation */
9510
Thomas Wouters78890102000-07-22 19:25:51 +00009511void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009513 int i;
9514
Thomas Wouters477c8d52006-05-27 19:21:47 +00009515 /* XXX - move this array to unicodectype.c ? */
9516 Py_UNICODE linebreak[] = {
9517 0x000A, /* LINE FEED */
9518 0x000D, /* CARRIAGE RETURN */
9519 0x001C, /* FILE SEPARATOR */
9520 0x001D, /* GROUP SEPARATOR */
9521 0x001E, /* RECORD SEPARATOR */
9522 0x0085, /* NEXT LINE */
9523 0x2028, /* LINE SEPARATOR */
9524 0x2029, /* PARAGRAPH SEPARATOR */
9525 };
9526
Fred Drakee4315f52000-05-09 19:53:39 +00009527 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009528 free_list = NULL;
9529 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009531 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009533
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009534 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009536 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009538
9539 /* initialize the linebreak bloom filter */
9540 bloom_linebreak = make_bloom_mask(
9541 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9542 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009543
9544 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545}
9546
9547/* Finalize the Unicode implementation */
9548
Christian Heimesa156e092008-02-16 07:38:31 +00009549int
9550PyUnicode_ClearFreeList(void)
9551{
9552 int freelist_size = numfree;
9553 PyUnicodeObject *u;
9554
9555 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 PyUnicodeObject *v = u;
9557 u = *(PyUnicodeObject **)u;
9558 if (v->str)
9559 PyObject_DEL(v->str);
9560 Py_XDECREF(v->defenc);
9561 PyObject_Del(v);
9562 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009563 }
9564 free_list = NULL;
9565 assert(numfree == 0);
9566 return freelist_size;
9567}
9568
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569void
Thomas Wouters78890102000-07-22 19:25:51 +00009570_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009572 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009574 Py_XDECREF(unicode_empty);
9575 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009576
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009577 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 if (unicode_latin1[i]) {
9579 Py_DECREF(unicode_latin1[i]);
9580 unicode_latin1[i] = NULL;
9581 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009582 }
Christian Heimesa156e092008-02-16 07:38:31 +00009583 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009585
Walter Dörwald16807132007-05-25 13:52:07 +00009586void
9587PyUnicode_InternInPlace(PyObject **p)
9588{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009589 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9590 PyObject *t;
9591 if (s == NULL || !PyUnicode_Check(s))
9592 Py_FatalError(
9593 "PyUnicode_InternInPlace: unicode strings only please!");
9594 /* If it's a subclass, we don't really know what putting
9595 it in the interned dict might do. */
9596 if (!PyUnicode_CheckExact(s))
9597 return;
9598 if (PyUnicode_CHECK_INTERNED(s))
9599 return;
9600 if (interned == NULL) {
9601 interned = PyDict_New();
9602 if (interned == NULL) {
9603 PyErr_Clear(); /* Don't leave an exception */
9604 return;
9605 }
9606 }
9607 /* It might be that the GetItem call fails even
9608 though the key is present in the dictionary,
9609 namely when this happens during a stack overflow. */
9610 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009612 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009613
Benjamin Peterson29060642009-01-31 22:14:21 +00009614 if (t) {
9615 Py_INCREF(t);
9616 Py_DECREF(*p);
9617 *p = t;
9618 return;
9619 }
Walter Dörwald16807132007-05-25 13:52:07 +00009620
Benjamin Peterson14339b62009-01-31 16:36:08 +00009621 PyThreadState_GET()->recursion_critical = 1;
9622 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9623 PyErr_Clear();
9624 PyThreadState_GET()->recursion_critical = 0;
9625 return;
9626 }
9627 PyThreadState_GET()->recursion_critical = 0;
9628 /* The two references in interned are not counted by refcnt.
9629 The deallocator will take care of this */
9630 Py_REFCNT(s) -= 2;
9631 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009632}
9633
9634void
9635PyUnicode_InternImmortal(PyObject **p)
9636{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009637 PyUnicode_InternInPlace(p);
9638 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9639 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9640 Py_INCREF(*p);
9641 }
Walter Dörwald16807132007-05-25 13:52:07 +00009642}
9643
9644PyObject *
9645PyUnicode_InternFromString(const char *cp)
9646{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009647 PyObject *s = PyUnicode_FromString(cp);
9648 if (s == NULL)
9649 return NULL;
9650 PyUnicode_InternInPlace(&s);
9651 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009652}
9653
9654void _Py_ReleaseInternedUnicodeStrings(void)
9655{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009656 PyObject *keys;
9657 PyUnicodeObject *s;
9658 Py_ssize_t i, n;
9659 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009660
Benjamin Peterson14339b62009-01-31 16:36:08 +00009661 if (interned == NULL || !PyDict_Check(interned))
9662 return;
9663 keys = PyDict_Keys(interned);
9664 if (keys == NULL || !PyList_Check(keys)) {
9665 PyErr_Clear();
9666 return;
9667 }
Walter Dörwald16807132007-05-25 13:52:07 +00009668
Benjamin Peterson14339b62009-01-31 16:36:08 +00009669 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9670 detector, interned unicode strings are not forcibly deallocated;
9671 rather, we give them their stolen references back, and then clear
9672 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009673
Benjamin Peterson14339b62009-01-31 16:36:08 +00009674 n = PyList_GET_SIZE(keys);
9675 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009676 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009677 for (i = 0; i < n; i++) {
9678 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9679 switch (s->state) {
9680 case SSTATE_NOT_INTERNED:
9681 /* XXX Shouldn't happen */
9682 break;
9683 case SSTATE_INTERNED_IMMORTAL:
9684 Py_REFCNT(s) += 1;
9685 immortal_size += s->length;
9686 break;
9687 case SSTATE_INTERNED_MORTAL:
9688 Py_REFCNT(s) += 2;
9689 mortal_size += s->length;
9690 break;
9691 default:
9692 Py_FatalError("Inconsistent interned string state.");
9693 }
9694 s->state = SSTATE_NOT_INTERNED;
9695 }
9696 fprintf(stderr, "total size of all interned strings: "
9697 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9698 "mortal/immortal\n", mortal_size, immortal_size);
9699 Py_DECREF(keys);
9700 PyDict_Clear(interned);
9701 Py_DECREF(interned);
9702 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009703}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009704
9705
9706/********************* Unicode Iterator **************************/
9707
9708typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009709 PyObject_HEAD
9710 Py_ssize_t it_index;
9711 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009712} unicodeiterobject;
9713
9714static void
9715unicodeiter_dealloc(unicodeiterobject *it)
9716{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009717 _PyObject_GC_UNTRACK(it);
9718 Py_XDECREF(it->it_seq);
9719 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009720}
9721
9722static int
9723unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9724{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009725 Py_VISIT(it->it_seq);
9726 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009727}
9728
9729static PyObject *
9730unicodeiter_next(unicodeiterobject *it)
9731{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009732 PyUnicodeObject *seq;
9733 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009734
Benjamin Peterson14339b62009-01-31 16:36:08 +00009735 assert(it != NULL);
9736 seq = it->it_seq;
9737 if (seq == NULL)
9738 return NULL;
9739 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009740
Benjamin Peterson14339b62009-01-31 16:36:08 +00009741 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9742 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009743 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009744 if (item != NULL)
9745 ++it->it_index;
9746 return item;
9747 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009748
Benjamin Peterson14339b62009-01-31 16:36:08 +00009749 Py_DECREF(seq);
9750 it->it_seq = NULL;
9751 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009752}
9753
9754static PyObject *
9755unicodeiter_len(unicodeiterobject *it)
9756{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009757 Py_ssize_t len = 0;
9758 if (it->it_seq)
9759 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9760 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009761}
9762
9763PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9764
9765static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009766 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009767 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009768 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009769};
9770
9771PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009772 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9773 "str_iterator", /* tp_name */
9774 sizeof(unicodeiterobject), /* tp_basicsize */
9775 0, /* tp_itemsize */
9776 /* methods */
9777 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9778 0, /* tp_print */
9779 0, /* tp_getattr */
9780 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009781 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009782 0, /* tp_repr */
9783 0, /* tp_as_number */
9784 0, /* tp_as_sequence */
9785 0, /* tp_as_mapping */
9786 0, /* tp_hash */
9787 0, /* tp_call */
9788 0, /* tp_str */
9789 PyObject_GenericGetAttr, /* tp_getattro */
9790 0, /* tp_setattro */
9791 0, /* tp_as_buffer */
9792 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9793 0, /* tp_doc */
9794 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9795 0, /* tp_clear */
9796 0, /* tp_richcompare */
9797 0, /* tp_weaklistoffset */
9798 PyObject_SelfIter, /* tp_iter */
9799 (iternextfunc)unicodeiter_next, /* tp_iternext */
9800 unicodeiter_methods, /* tp_methods */
9801 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009802};
9803
9804static PyObject *
9805unicode_iter(PyObject *seq)
9806{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009807 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009808
Benjamin Peterson14339b62009-01-31 16:36:08 +00009809 if (!PyUnicode_Check(seq)) {
9810 PyErr_BadInternalCall();
9811 return NULL;
9812 }
9813 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9814 if (it == NULL)
9815 return NULL;
9816 it->it_index = 0;
9817 Py_INCREF(seq);
9818 it->it_seq = (PyUnicodeObject *)seq;
9819 _PyObject_GC_TRACK(it);
9820 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009821}
9822
Martin v. Löwis5b222132007-06-10 09:51:05 +00009823size_t
9824Py_UNICODE_strlen(const Py_UNICODE *u)
9825{
9826 int res = 0;
9827 while(*u++)
9828 res++;
9829 return res;
9830}
9831
9832Py_UNICODE*
9833Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9834{
9835 Py_UNICODE *u = s1;
9836 while ((*u++ = *s2++));
9837 return s1;
9838}
9839
9840Py_UNICODE*
9841Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9842{
9843 Py_UNICODE *u = s1;
9844 while ((*u++ = *s2++))
9845 if (n-- == 0)
9846 break;
9847 return s1;
9848}
9849
9850int
9851Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9852{
9853 while (*s1 && *s2 && *s1 == *s2)
9854 s1++, s2++;
9855 if (*s1 && *s2)
9856 return (*s1 < *s2) ? -1 : +1;
9857 if (*s1)
9858 return 1;
9859 if (*s2)
9860 return -1;
9861 return 0;
9862}
9863
9864Py_UNICODE*
9865Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9866{
9867 const Py_UNICODE *p;
9868 for (p = s; *p; p++)
9869 if (*p == c)
9870 return (Py_UNICODE*)p;
9871 return NULL;
9872}
9873
9874
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009875#ifdef __cplusplus
9876}
9877#endif