blob: c50f60165bf6526173a6bb56ad859ec9840487f5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.
   Indexed by an ASCII code (0..127); a nonzero entry marks whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
       0x000B LINE TABULATION,      0x000C FORM FEED,
       0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR,   0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Fast detection of ASCII line break characters.
   Indexed by an ASCII code (0..127); a nonzero entry marks a line break.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x000A LINE FEED, 0x000B LINE TABULATION,
       0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
211/* stuff to implement simple "bloom filters" for Unicode characters.
212 to keep things simple, we use a single bitmask, using the least 5
213 bits from each unicode characters as the bit index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
232#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Membership test: nonzero if chr is in set.  The bloom mask is checked
   first so that unicode_member() is only called for likely candidates.
   The whole expansion is parenthesized so that the macro behaves as a
   single expression (e.g. under a leading '!'). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000310 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000313
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 return 0;
315}
316
317/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000318 Ux0000 terminated; some code (e.g. new_identifier)
319 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320
321 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000322 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323
324*/
325
326static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000327PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328{
329 register PyUnicodeObject *unicode;
330
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 if (length == 0 && unicode_empty != NULL) {
333 Py_INCREF(unicode_empty);
334 return unicode_empty;
335 }
336
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000337 /* Ensure we won't overflow the size. */
338 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
339 return (PyUnicodeObject *)PyErr_NoMemory();
340 }
341
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000343 if (free_list) {
344 unicode = free_list;
345 free_list = *(PyUnicodeObject **)unicode;
346 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000347 if (unicode->str) {
348 /* Keep-Alive optimization: we only upsize the buffer,
349 never downsize it. */
350 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000351 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000352 PyObject_DEL(unicode->str);
353 unicode->str = NULL;
354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000355 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000356 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000359 }
360 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000364 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 if (unicode == NULL)
366 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000367 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
368 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
370
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000371 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 PyErr_NoMemory();
373 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000374 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000375 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000376 * the caller fails before initializing str -- unicode_resize()
377 * reads str[0], and the Keep-Alive optimization can keep memory
378 * allocated for str alive across a call to unicode_dealloc(unicode).
379 * We don't want unicode_resize to read uninitialized memory in
380 * that case.
381 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000382 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000384 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000386 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000387 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000389
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000391 /* XXX UNREF/NEWREF interface should be more symmetrical */
392 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000393 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000394 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396}
397
398static
Guido van Rossum9475a232001-10-05 20:51:39 +0000399void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400{
Walter Dörwald16807132007-05-25 13:52:07 +0000401 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000402 case SSTATE_NOT_INTERNED:
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_MORTAL:
406 /* revive dead object temporarily for DelItem */
407 Py_REFCNT(unicode) = 3;
408 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
409 Py_FatalError(
410 "deletion of interned string failed");
411 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000412
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 case SSTATE_INTERNED_IMMORTAL:
414 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000415
Benjamin Peterson29060642009-01-31 22:14:21 +0000416 default:
417 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000418 }
419
Guido van Rossum604ddf82001-12-06 20:03:56 +0000420 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000422 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000423 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
424 PyObject_DEL(unicode->str);
425 unicode->str = NULL;
426 unicode->length = 0;
427 }
428 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000429 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000430 }
431 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000432 *(PyUnicodeObject **)unicode = free_list;
433 free_list = unicode;
434 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 }
436 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000437 PyObject_DEL(unicode->str);
438 Py_XDECREF(unicode->defenc);
439 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 }
441}
442
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000443static
444int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445{
446 register PyUnicodeObject *v;
447
448 /* Argument checks */
449 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 PyErr_BadInternalCall();
451 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000452 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000453 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000454 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 PyErr_BadInternalCall();
456 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000457 }
458
459 /* Resizing unicode_empty and single character objects is not
460 possible since these are being shared. We simply return a fresh
461 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000462 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000463 (v == unicode_empty || v->length == 1)) {
464 PyUnicodeObject *w = _PyUnicode_New(length);
465 if (w == NULL)
466 return -1;
467 Py_UNICODE_COPY(w->str, v->str,
468 length < v->length ? length : v->length);
469 Py_DECREF(*unicode);
470 *unicode = w;
471 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
473
474 /* Note that we don't have to modify *unicode for unshared Unicode
475 objects, since we can modify them in-place. */
476 return unicode_resize(v, length);
477}
478
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000479int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
480{
481 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
482}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000485 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486{
487 PyUnicodeObject *unicode;
488
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000489 /* If the Unicode data is known at construction time, we can apply
490 some optimizations which share commonly used objects. */
491 if (u != NULL) {
492
Benjamin Peterson29060642009-01-31 22:14:21 +0000493 /* Optimization for empty strings */
494 if (size == 0 && unicode_empty != NULL) {
495 Py_INCREF(unicode_empty);
496 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000497 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000498
499 /* Single character Unicode objects in the Latin-1 range are
500 shared when using this constructor */
501 if (size == 1 && *u < 256) {
502 unicode = unicode_latin1[*u];
503 if (!unicode) {
504 unicode = _PyUnicode_New(1);
505 if (!unicode)
506 return NULL;
507 unicode->str[0] = *u;
508 unicode_latin1[*u] = unicode;
509 }
510 Py_INCREF(unicode);
511 return (PyObject *)unicode;
512 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000513 }
Tim Petersced69f82003-09-16 20:30:58 +0000514
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515 unicode = _PyUnicode_New(size);
516 if (!unicode)
517 return NULL;
518
519 /* Copy the Unicode data into the new object */
520 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000521 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522
523 return (PyObject *)unicode;
524}
525
Walter Dörwaldd2034312007-05-18 16:29:38 +0000526PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527{
528 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Benjamin Peterson14339b62009-01-31 16:36:08 +0000530 if (size < 0) {
531 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000532 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000533 return NULL;
534 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000535
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000537 some optimizations which share commonly used objects.
538 Also, this means the input must be UTF-8, so fall back to the
539 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000540 if (u != NULL) {
541
Benjamin Peterson29060642009-01-31 22:14:21 +0000542 /* Optimization for empty strings */
543 if (size == 0 && unicode_empty != NULL) {
544 Py_INCREF(unicode_empty);
545 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000546 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000547
548 /* Single characters are shared when using this constructor.
549 Restrict to ASCII, since the input must be UTF-8. */
550 if (size == 1 && Py_CHARMASK(*u) < 128) {
551 unicode = unicode_latin1[Py_CHARMASK(*u)];
552 if (!unicode) {
553 unicode = _PyUnicode_New(1);
554 if (!unicode)
555 return NULL;
556 unicode->str[0] = Py_CHARMASK(*u);
557 unicode_latin1[Py_CHARMASK(*u)] = unicode;
558 }
559 Py_INCREF(unicode);
560 return (PyObject *)unicode;
561 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000562
563 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 }
565
Walter Dörwald55507312007-05-18 13:12:10 +0000566 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000567 if (!unicode)
568 return NULL;
569
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000570 return (PyObject *)unicode;
571}
572
Walter Dörwaldd2034312007-05-18 16:29:38 +0000573PyObject *PyUnicode_FromString(const char *u)
574{
575 size_t size = strlen(u);
576 if (size > PY_SSIZE_T_MAX) {
577 PyErr_SetString(PyExc_OverflowError, "input too long");
578 return NULL;
579 }
580
581 return PyUnicode_FromStringAndSize(u, size);
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584#ifdef HAVE_WCHAR_H
585
Mark Dickinson081dfee2009-03-18 14:47:41 +0000586#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
587# define CONVERT_WCHAR_TO_SURROGATES
588#endif
589
590#ifdef CONVERT_WCHAR_TO_SURROGATES
591
592/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
593 to convert from UTF32 to UTF16. */
594
595PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
596 Py_ssize_t size)
597{
598 PyUnicodeObject *unicode;
599 register Py_ssize_t i;
600 Py_ssize_t alloc;
601 const wchar_t *orig_w;
602
603 if (w == NULL) {
604 if (size == 0)
605 return PyUnicode_FromStringAndSize(NULL, 0);
606 PyErr_BadInternalCall();
607 return NULL;
608 }
609
610 if (size == -1) {
611 size = wcslen(w);
612 }
613
614 alloc = size;
615 orig_w = w;
616 for (i = size; i > 0; i--) {
617 if (*w > 0xFFFF)
618 alloc++;
619 w++;
620 }
621 w = orig_w;
622 unicode = _PyUnicode_New(alloc);
623 if (!unicode)
624 return NULL;
625
626 /* Copy the wchar_t data into the new object */
627 {
628 register Py_UNICODE *u;
629 u = PyUnicode_AS_UNICODE(unicode);
630 for (i = size; i > 0; i--) {
631 if (*w > 0xFFFF) {
632 wchar_t ordinal = *w++;
633 ordinal -= 0x10000;
634 *u++ = 0xD800 | (ordinal >> 10);
635 *u++ = 0xDC00 | (ordinal & 0x3FF);
636 }
637 else
638 *u++ = *w++;
639 }
640 }
641 return (PyObject *)unicode;
642}
643
644#else
645
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000647 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648{
649 PyUnicodeObject *unicode;
650
651 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000652 if (size == 0)
653 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 PyErr_BadInternalCall();
655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 }
657
Martin v. Löwis790465f2008-04-05 20:41:37 +0000658 if (size == -1) {
659 size = wcslen(w);
660 }
661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 unicode = _PyUnicode_New(size);
663 if (!unicode)
664 return NULL;
665
666 /* Copy the wchar_t data into the new object */
667#ifdef HAVE_USABLE_WCHAR_T
668 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000669#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000671 register Py_UNICODE *u;
672 register Py_ssize_t i;
673 u = PyUnicode_AS_UNICODE(unicode);
674 for (i = size; i > 0; i--)
675 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000676 }
677#endif
678
679 return (PyObject *)unicode;
680}
681
Mark Dickinson081dfee2009-03-18 14:47:41 +0000682#endif /* CONVERT_WCHAR_TO_SURROGATES */
683
684#undef CONVERT_WCHAR_TO_SURROGATES
685
Walter Dörwald346737f2007-05-31 10:44:43 +0000686static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000687makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
688 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000689{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000690 *fmt++ = '%';
691 if (width) {
692 if (zeropad)
693 *fmt++ = '0';
694 fmt += sprintf(fmt, "%d", width);
695 }
696 if (precision)
697 fmt += sprintf(fmt, ".%d", precision);
698 if (longflag)
699 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000700 else if (longlongflag) {
701 /* longlongflag should only ever be nonzero on machines with
702 HAVE_LONG_LONG defined */
703#ifdef HAVE_LONG_LONG
704 char *f = PY_FORMAT_LONG_LONG;
705 while (*f)
706 *fmt++ = *f++;
707#else
708 /* we shouldn't ever get here */
709 assert(0);
710 *fmt++ = 'l';
711#endif
712 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000713 else if (size_tflag) {
714 char *f = PY_FORMAT_SIZE_T;
715 while (*f)
716 *fmt++ = *f++;
717 }
718 *fmt++ = c;
719 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000720}
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
723
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000724/* size of fixed-size buffer for formatting single arguments */
725#define ITEM_BUFFER_LEN 21
726/* maximum number of characters required for output of %ld. 21 characters
727 allows for 64-bit integers (in decimal) and an optional sign. */
728#define MAX_LONG_CHARS 21
729/* maximum number of characters required for output of %lld.
730 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
731 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
732#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
733
Walter Dörwaldd2034312007-05-18 16:29:38 +0000734PyObject *
735PyUnicode_FromFormatV(const char *format, va_list vargs)
736{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000737 va_list count;
738 Py_ssize_t callcount = 0;
739 PyObject **callresults = NULL;
740 PyObject **callresult = NULL;
741 Py_ssize_t n = 0;
742 int width = 0;
743 int precision = 0;
744 int zeropad;
745 const char* f;
746 Py_UNICODE *s;
747 PyObject *string;
748 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000749 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000750 /* use abuffer instead of buffer, if we need more space
751 * (which can happen if there's a format specifier with width). */
752 char *abuffer = NULL;
753 char *realbuffer;
754 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000755 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000756 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000758 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000759 /* step 1: count the number of %S/%R/%A/%s format specifications
760 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
761 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
762 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000763 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000764 if (*f == '%') {
765 if (*(f+1)=='%')
766 continue;
767 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
768 ++callcount;
769 while (ISDIGIT((unsigned)*f))
770 width = (width*10) + *f++ - '0';
771 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
772 ;
773 if (*f == 's')
774 ++callcount;
775 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000776 }
777 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000778 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000779 if (callcount) {
780 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
781 if (!callresults) {
782 PyErr_NoMemory();
783 return NULL;
784 }
785 callresult = callresults;
786 }
787 /* step 3: figure out how large a buffer we need */
788 for (f = format; *f; f++) {
789 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000790#ifdef HAVE_LONG_LONG
791 int longlongflag = 0;
792#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 const char* p = f;
794 width = 0;
795 while (ISDIGIT((unsigned)*f))
796 width = (width*10) + *f++ - '0';
797 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
798 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
801 * they don't affect the amount of space we reserve.
802 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000803 if (*f == 'l') {
804 if (f[1] == 'd' || f[1] == 'u') {
805 ++f;
806 }
807#ifdef HAVE_LONG_LONG
808 else if (f[1] == 'l' &&
809 (f[2] == 'd' || f[2] == 'u')) {
810 longlongflag = 1;
811 f += 2;
812 }
813#endif
814 }
815 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000816 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000817 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818
Benjamin Peterson14339b62009-01-31 16:36:08 +0000819 switch (*f) {
820 case 'c':
821 (void)va_arg(count, int);
822 /* fall through... */
823 case '%':
824 n++;
825 break;
826 case 'd': case 'u': case 'i': case 'x':
827 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000828#ifdef HAVE_LONG_LONG
829 if (longlongflag) {
830 if (width < MAX_LONG_LONG_CHARS)
831 width = MAX_LONG_LONG_CHARS;
832 }
833 else
834#endif
835 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
836 including sign. Decimal takes the most space. This
837 isn't enough for octal. If a width is specified we
838 need more (which we allocate later). */
839 if (width < MAX_LONG_CHARS)
840 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000842 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000843 if (abuffersize < width)
844 abuffersize = width;
845 break;
846 case 's':
847 {
848 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000849 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000850 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
851 if (!str)
852 goto fail;
853 n += PyUnicode_GET_SIZE(str);
854 /* Remember the str and switch to the next slot */
855 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000856 break;
857 }
858 case 'U':
859 {
860 PyObject *obj = va_arg(count, PyObject *);
861 assert(obj && PyUnicode_Check(obj));
862 n += PyUnicode_GET_SIZE(obj);
863 break;
864 }
865 case 'V':
866 {
867 PyObject *obj = va_arg(count, PyObject *);
868 const char *str = va_arg(count, const char *);
869 assert(obj || str);
870 assert(!obj || PyUnicode_Check(obj));
871 if (obj)
872 n += PyUnicode_GET_SIZE(obj);
873 else
874 n += strlen(str);
875 break;
876 }
877 case 'S':
878 {
879 PyObject *obj = va_arg(count, PyObject *);
880 PyObject *str;
881 assert(obj);
882 str = PyObject_Str(obj);
883 if (!str)
884 goto fail;
885 n += PyUnicode_GET_SIZE(str);
886 /* Remember the str and switch to the next slot */
887 *callresult++ = str;
888 break;
889 }
890 case 'R':
891 {
892 PyObject *obj = va_arg(count, PyObject *);
893 PyObject *repr;
894 assert(obj);
895 repr = PyObject_Repr(obj);
896 if (!repr)
897 goto fail;
898 n += PyUnicode_GET_SIZE(repr);
899 /* Remember the repr and switch to the next slot */
900 *callresult++ = repr;
901 break;
902 }
903 case 'A':
904 {
905 PyObject *obj = va_arg(count, PyObject *);
906 PyObject *ascii;
907 assert(obj);
908 ascii = PyObject_ASCII(obj);
909 if (!ascii)
910 goto fail;
911 n += PyUnicode_GET_SIZE(ascii);
912 /* Remember the repr and switch to the next slot */
913 *callresult++ = ascii;
914 break;
915 }
916 case 'p':
917 (void) va_arg(count, int);
918 /* maximum 64-bit pointer representation:
919 * 0xffffffffffffffff
920 * so 19 characters is enough.
921 * XXX I count 18 -- what's the extra for?
922 */
923 n += 19;
924 break;
925 default:
926 /* if we stumble upon an unknown
927 formatting code, copy the rest of
928 the format string to the output
929 string. (we cannot just skip the
930 code, since there's no way to know
931 what's in the argument list) */
932 n += strlen(p);
933 goto expand;
934 }
935 } else
936 n++;
937 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000938 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000939 if (abuffersize > ITEM_BUFFER_LEN) {
940 /* add 1 for sprintf's trailing null byte */
941 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000942 if (!abuffer) {
943 PyErr_NoMemory();
944 goto fail;
945 }
946 realbuffer = abuffer;
947 }
948 else
949 realbuffer = buffer;
950 /* step 4: fill the buffer */
951 /* Since we've analyzed how much space we need for the worst case,
952 we don't have to resize the string.
953 There can be no errors beyond this point. */
954 string = PyUnicode_FromUnicode(NULL, n);
955 if (!string)
956 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000957
Benjamin Peterson14339b62009-01-31 16:36:08 +0000958 s = PyUnicode_AS_UNICODE(string);
959 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000960
Benjamin Peterson14339b62009-01-31 16:36:08 +0000961 for (f = format; *f; f++) {
962 if (*f == '%') {
963 const char* p = f++;
964 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000965 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000966 int size_tflag = 0;
967 zeropad = (*f == '0');
968 /* parse the width.precision part */
969 width = 0;
970 while (ISDIGIT((unsigned)*f))
971 width = (width*10) + *f++ - '0';
972 precision = 0;
973 if (*f == '.') {
974 f++;
975 while (ISDIGIT((unsigned)*f))
976 precision = (precision*10) + *f++ - '0';
977 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000978 /* Handle %ld, %lu, %lld and %llu. */
979 if (*f == 'l') {
980 if (f[1] == 'd' || f[1] == 'u') {
981 longflag = 1;
982 ++f;
983 }
984#ifdef HAVE_LONG_LONG
985 else if (f[1] == 'l' &&
986 (f[2] == 'd' || f[2] == 'u')) {
987 longlongflag = 1;
988 f += 2;
989 }
990#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000991 }
992 /* handle the size_t flag. */
993 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
994 size_tflag = 1;
995 ++f;
996 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000997
Benjamin Peterson14339b62009-01-31 16:36:08 +0000998 switch (*f) {
999 case 'c':
1000 *s++ = va_arg(vargs, int);
1001 break;
1002 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001003 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1004 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001005 if (longflag)
1006 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001007#ifdef HAVE_LONG_LONG
1008 else if (longlongflag)
1009 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1010#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001011 else if (size_tflag)
1012 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1013 else
1014 sprintf(realbuffer, fmt, va_arg(vargs, int));
1015 appendstring(realbuffer);
1016 break;
1017 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001018 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1019 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001020 if (longflag)
1021 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001022#ifdef HAVE_LONG_LONG
1023 else if (longlongflag)
1024 sprintf(realbuffer, fmt, va_arg(vargs,
1025 unsigned PY_LONG_LONG));
1026#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001027 else if (size_tflag)
1028 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1029 else
1030 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1031 appendstring(realbuffer);
1032 break;
1033 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 sprintf(realbuffer, fmt, va_arg(vargs, int));
1036 appendstring(realbuffer);
1037 break;
1038 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001039 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001040 sprintf(realbuffer, fmt, va_arg(vargs, int));
1041 appendstring(realbuffer);
1042 break;
1043 case 's':
1044 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001045 /* unused, since we already have the result */
1046 (void) va_arg(vargs, char *);
1047 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1048 PyUnicode_GET_SIZE(*callresult));
1049 s += PyUnicode_GET_SIZE(*callresult);
1050 /* We're done with the unicode()/repr() => forget it */
1051 Py_DECREF(*callresult);
1052 /* switch to next unicode()/repr() result */
1053 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001054 break;
1055 }
1056 case 'U':
1057 {
1058 PyObject *obj = va_arg(vargs, PyObject *);
1059 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1060 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1061 s += size;
1062 break;
1063 }
1064 case 'V':
1065 {
1066 PyObject *obj = va_arg(vargs, PyObject *);
1067 const char *str = va_arg(vargs, const char *);
1068 if (obj) {
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 } else {
1073 appendstring(str);
1074 }
1075 break;
1076 }
1077 case 'S':
1078 case 'R':
1079 {
1080 Py_UNICODE *ucopy;
1081 Py_ssize_t usize;
1082 Py_ssize_t upos;
1083 /* unused, since we already have the result */
1084 (void) va_arg(vargs, PyObject *);
1085 ucopy = PyUnicode_AS_UNICODE(*callresult);
1086 usize = PyUnicode_GET_SIZE(*callresult);
1087 for (upos = 0; upos<usize;)
1088 *s++ = ucopy[upos++];
1089 /* We're done with the unicode()/repr() => forget it */
1090 Py_DECREF(*callresult);
1091 /* switch to next unicode()/repr() result */
1092 ++callresult;
1093 break;
1094 }
1095 case 'p':
1096 sprintf(buffer, "%p", va_arg(vargs, void*));
1097 /* %p is ill-defined: ensure leading 0x. */
1098 if (buffer[1] == 'X')
1099 buffer[1] = 'x';
1100 else if (buffer[1] != 'x') {
1101 memmove(buffer+2, buffer, strlen(buffer)+1);
1102 buffer[0] = '0';
1103 buffer[1] = 'x';
1104 }
1105 appendstring(buffer);
1106 break;
1107 case '%':
1108 *s++ = '%';
1109 break;
1110 default:
1111 appendstring(p);
1112 goto end;
1113 }
1114 } else
1115 *s++ = *f;
1116 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001117
Benjamin Peterson29060642009-01-31 22:14:21 +00001118 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001119 if (callresults)
1120 PyObject_Free(callresults);
1121 if (abuffer)
1122 PyObject_Free(abuffer);
1123 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1124 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001125 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001126 if (callresults) {
1127 PyObject **callresult2 = callresults;
1128 while (callresult2 < callresult) {
1129 Py_DECREF(*callresult2);
1130 ++callresult2;
1131 }
1132 PyObject_Free(callresults);
1133 }
1134 if (abuffer)
1135 PyObject_Free(abuffer);
1136 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001137}
1138
1139#undef appendstring
1140
1141PyObject *
1142PyUnicode_FromFormat(const char *format, ...)
1143{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 PyObject* ret;
1145 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001146
1147#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001148 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001149#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001150 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001151#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 ret = PyUnicode_FromFormatV(format, vargs);
1153 va_end(vargs);
1154 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
Martin v. Löwis18e16552006-02-15 17:27:45 +00001157Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 wchar_t *w,
1159 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160{
1161 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001162 PyErr_BadInternalCall();
1163 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001165
1166 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001169
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170#ifdef HAVE_USABLE_WCHAR_T
1171 memcpy(w, unicode->str, size * sizeof(wchar_t));
1172#else
1173 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001174 register Py_UNICODE *u;
1175 register Py_ssize_t i;
1176 u = PyUnicode_AS_UNICODE(unicode);
1177 for (i = size; i > 0; i--)
1178 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 }
1180#endif
1181
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001182 if (size > PyUnicode_GET_SIZE(unicode))
1183 return PyUnicode_GET_SIZE(unicode);
1184 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001185 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186}
1187
1188#endif
1189
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001190PyObject *PyUnicode_FromOrdinal(int ordinal)
1191{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001192 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001193
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001194 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 PyErr_SetString(PyExc_ValueError,
1196 "chr() arg not in range(0x110000)");
1197 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001198 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001199
1200#ifndef Py_UNICODE_WIDE
1201 if (ordinal > 0xffff) {
1202 ordinal -= 0x10000;
1203 s[0] = 0xD800 | (ordinal >> 10);
1204 s[1] = 0xDC00 | (ordinal & 0x3FF);
1205 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001206 }
1207#endif
1208
Hye-Shik Chang40574832004-04-06 07:24:51 +00001209 s[0] = (Py_UNICODE)ordinal;
1210 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001211}
1212
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213PyObject *PyUnicode_FromObject(register PyObject *obj)
1214{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001215 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001217 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001218 Py_INCREF(obj);
1219 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001220 }
1221 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001222 /* For a Unicode subtype that's not a Unicode object,
1223 return a true Unicode object with the same data. */
1224 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1225 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001226 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001227 PyErr_Format(PyExc_TypeError,
1228 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001229 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001230 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001231}
1232
1233PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 const char *encoding,
1235 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001236{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001237 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001238 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001239 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001240
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001242 PyErr_BadInternalCall();
1243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001245
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001246 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001247 PyErr_SetString(PyExc_TypeError,
1248 "decoding str is not supported");
1249 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001250 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001251
1252 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001253 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001254 s = PyBytes_AS_STRING(obj);
1255 len = PyBytes_GET_SIZE(obj);
1256 }
1257 else if (PyByteArray_Check(obj)) {
1258 s = PyByteArray_AS_STRING(obj);
1259 len = PyByteArray_GET_SIZE(obj);
1260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001262 /* Overwrite the error message with something more useful in
1263 case of a TypeError. */
1264 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001265 PyErr_Format(PyExc_TypeError,
Georg Brandl952867a2010-06-27 10:17:12 +00001266 "coercing to str: need bytes, bytearray or char buffer, "
Benjamin Peterson29060642009-01-31 22:14:21 +00001267 "%.80s found",
1268 Py_TYPE(obj)->tp_name);
1269 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001270 }
Tim Petersced69f82003-09-16 20:30:58 +00001271
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001272 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001274 Py_INCREF(unicode_empty);
1275 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 }
Tim Petersced69f82003-09-16 20:30:58 +00001277 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001278 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001279
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001280 return v;
1281
Benjamin Peterson29060642009-01-31 22:14:21 +00001282 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284}
1285
/* Write a normalized copy of encoding into lower: upper-case letters are
   lowered and every '_' becomes '-', so that e.g. UTF_8 matches "utf-8".
   lower has room for lower_len bytes including the terminating NUL.

   Returns 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1 characters); in that case lower is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last usable slot, reserved for the NUL terminator */
    char *const limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (ISUPPER(*src))
            ch = TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1318
1319PyObject *PyUnicode_Decode(const char *s,
1320 Py_ssize_t size,
1321 const char *encoding,
1322 const char *errors)
1323{
1324 PyObject *buffer = NULL, *unicode;
1325 Py_buffer info;
1326 char lower[11]; /* Enough for any encoding shortcut */
1327
1328 if (encoding == NULL)
1329 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001330
1331 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001332 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1333 if (strcmp(lower, "utf-8") == 0)
1334 return PyUnicode_DecodeUTF8(s, size, errors);
1335 else if ((strcmp(lower, "latin-1") == 0) ||
1336 (strcmp(lower, "iso-8859-1") == 0))
1337 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001338#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001339 else if (strcmp(lower, "mbcs") == 0)
1340 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001341#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001342 else if (strcmp(lower, "ascii") == 0)
1343 return PyUnicode_DecodeASCII(s, size, errors);
1344 else if (strcmp(lower, "utf-16") == 0)
1345 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1346 else if (strcmp(lower, "utf-32") == 0)
1347 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349
1350 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001351 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001352 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001353 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001354 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 if (buffer == NULL)
1356 goto onError;
1357 unicode = PyCodec_Decode(buffer, encoding, errors);
1358 if (unicode == NULL)
1359 goto onError;
1360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001362 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001363 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364 Py_DECREF(unicode);
1365 goto onError;
1366 }
1367 Py_DECREF(buffer);
1368 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001369
Benjamin Peterson29060642009-01-31 22:14:21 +00001370 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371 Py_XDECREF(buffer);
1372 return NULL;
1373}
1374
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1376 const char *encoding,
1377 const char *errors)
1378{
1379 PyObject *v;
1380
1381 if (!PyUnicode_Check(unicode)) {
1382 PyErr_BadArgument();
1383 goto onError;
1384 }
1385
1386 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001387 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001388
1389 /* Decode via the codec registry */
1390 v = PyCodec_Decode(unicode, encoding, errors);
1391 if (v == NULL)
1392 goto onError;
1393 return v;
1394
Benjamin Peterson29060642009-01-31 22:14:21 +00001395 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001396 return NULL;
1397}
1398
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001399PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1400 const char *encoding,
1401 const char *errors)
1402{
1403 PyObject *v;
1404
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409
1410 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001411 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001412
1413 /* Decode via the codec registry */
1414 v = PyCodec_Decode(unicode, encoding, errors);
1415 if (v == NULL)
1416 goto onError;
1417 if (!PyUnicode_Check(v)) {
1418 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001419 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001420 Py_TYPE(v)->tp_name);
1421 Py_DECREF(v);
1422 goto onError;
1423 }
1424 return v;
1425
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001427 return NULL;
1428}
1429
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001431 Py_ssize_t size,
1432 const char *encoding,
1433 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434{
1435 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001436
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437 unicode = PyUnicode_FromUnicode(s, size);
1438 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1441 Py_DECREF(unicode);
1442 return v;
1443}
1444
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001445PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1446 const char *encoding,
1447 const char *errors)
1448{
1449 PyObject *v;
1450
1451 if (!PyUnicode_Check(unicode)) {
1452 PyErr_BadArgument();
1453 goto onError;
1454 }
1455
1456 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001457 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001458
1459 /* Encode via the codec registry */
1460 v = PyCodec_Encode(unicode, encoding, errors);
1461 if (v == NULL)
1462 goto onError;
1463 return v;
1464
Benjamin Peterson29060642009-01-31 22:14:21 +00001465 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001466 return NULL;
1467}
1468
Victor Stinnerae6265f2010-05-15 16:27:27 +00001469PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1470{
Victor Stinner313a1202010-06-11 23:56:51 +00001471 if (Py_FileSystemDefaultEncoding) {
1472#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1473 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1474 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1475 PyUnicode_GET_SIZE(unicode),
1476 NULL);
1477#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001478 return PyUnicode_AsEncodedString(unicode,
1479 Py_FileSystemDefaultEncoding,
1480 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001481 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001482 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Victor Stinner3119ed72010-08-18 22:26:50 +00001483 PyUnicode_GET_SIZE(unicode),
1484 "surrogateescape");
Victor Stinnerae6265f2010-05-15 16:27:27 +00001485}
1486
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1488 const char *encoding,
1489 const char *errors)
1490{
1491 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001492 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001493
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 if (!PyUnicode_Check(unicode)) {
1495 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497 }
Fred Drakee4315f52000-05-09 19:53:39 +00001498
Tim Petersced69f82003-09-16 20:30:58 +00001499 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001500 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001501
1502 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001503 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1504 if (strcmp(lower, "utf-8") == 0)
1505 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1506 PyUnicode_GET_SIZE(unicode),
1507 errors);
1508 else if ((strcmp(lower, "latin-1") == 0) ||
1509 (strcmp(lower, "iso-8859-1") == 0))
1510 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1511 PyUnicode_GET_SIZE(unicode),
1512 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001513#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001514 else if (strcmp(lower, "mbcs") == 0)
1515 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001518#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001519 else if (strcmp(lower, "ascii") == 0)
1520 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1521 PyUnicode_GET_SIZE(unicode),
1522 errors);
1523 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001524 /* During bootstrap, we may need to find the encodings
1525 package, to load the file system encoding, and require the
1526 file system encoding in order to load the encodings
1527 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001528
Victor Stinner59e62db2010-05-15 13:14:32 +00001529 Break out of this dependency by assuming that the path to
1530 the encodings module is ASCII-only. XXX could try wcstombs
1531 instead, if the file system encoding is the locale's
1532 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001533 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001534 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1535 !PyThreadState_GET()->interp->codecs_initialized)
1536 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1537 PyUnicode_GET_SIZE(unicode),
1538 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539
1540 /* Encode via the codec registry */
1541 v = PyCodec_Encode(unicode, encoding, errors);
1542 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001543 return NULL;
1544
1545 /* The normal path */
1546 if (PyBytes_Check(v))
1547 return v;
1548
1549 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001550 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001551 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001552 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001553
1554 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1555 "encoder %s returned bytearray instead of bytes",
1556 encoding);
1557 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001558 Py_DECREF(v);
1559 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001561
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001562 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1563 Py_DECREF(v);
1564 return b;
1565 }
1566
1567 PyErr_Format(PyExc_TypeError,
1568 "encoder did not return a bytes object (type=%.400s)",
1569 Py_TYPE(v)->tp_name);
1570 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001571 return NULL;
1572}
1573
1574PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1575 const char *encoding,
1576 const char *errors)
1577{
1578 PyObject *v;
1579
1580 if (!PyUnicode_Check(unicode)) {
1581 PyErr_BadArgument();
1582 goto onError;
1583 }
1584
1585 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001587
1588 /* Encode via the codec registry */
1589 v = PyCodec_Encode(unicode, encoding, errors);
1590 if (v == NULL)
1591 goto onError;
1592 if (!PyUnicode_Check(v)) {
1593 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001594 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001595 Py_TYPE(v)->tp_name);
1596 Py_DECREF(v);
1597 goto onError;
1598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001600
Benjamin Peterson29060642009-01-31 22:14:21 +00001601 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602 return NULL;
1603}
1604
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001605PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001606 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001607{
1608 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001609 if (v)
1610 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001611 if (errors != NULL)
1612 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001613 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001614 PyUnicode_GET_SIZE(unicode),
1615 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001616 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001617 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001618 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001619 return v;
1620}
1621
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001622PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001623PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001624 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001625 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1626}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001627
Christian Heimes5894ba72007-11-04 11:43:14 +00001628PyObject*
1629PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1630{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001631 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1632 can be undefined. If it is case, decode using UTF-8. The following assumes
1633 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1634 bootstrapping process where the codecs aren't ready yet.
1635 */
1636 if (Py_FileSystemDefaultEncoding) {
1637#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001638 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001639 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001640 }
1641#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001642 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001643 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001644 }
1645#endif
1646 return PyUnicode_Decode(s, size,
1647 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001648 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001649 }
1650 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001651 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001652 }
1653}
1654
Martin v. Löwis011e8422009-05-05 04:43:17 +00001655
1656int
1657PyUnicode_FSConverter(PyObject* arg, void* addr)
1658{
1659 PyObject *output = NULL;
1660 Py_ssize_t size;
1661 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001662 if (arg == NULL) {
1663 Py_DECREF(*(PyObject**)addr);
1664 return 1;
1665 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001666 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001667 output = arg;
1668 Py_INCREF(output);
1669 }
1670 else {
1671 arg = PyUnicode_FromObject(arg);
1672 if (!arg)
1673 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001674 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001675 Py_DECREF(arg);
1676 if (!output)
1677 return 0;
1678 if (!PyBytes_Check(output)) {
1679 Py_DECREF(output);
1680 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1681 return 0;
1682 }
1683 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001684 size = PyBytes_GET_SIZE(output);
1685 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001686 if (size != strlen(data)) {
1687 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1688 Py_DECREF(output);
1689 return 0;
1690 }
1691 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001692 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001693}
1694
1695
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001696int
1697PyUnicode_FSDecoder(PyObject* arg, void* addr)
1698{
1699 PyObject *output = NULL;
1700 Py_ssize_t size;
1701 void *data;
1702 if (arg == NULL) {
1703 Py_DECREF(*(PyObject**)addr);
1704 return 1;
1705 }
1706 if (PyUnicode_Check(arg)) {
1707 output = arg;
1708 Py_INCREF(output);
1709 }
1710 else {
1711 arg = PyBytes_FromObject(arg);
1712 if (!arg)
1713 return 0;
1714 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1715 PyBytes_GET_SIZE(arg));
1716 Py_DECREF(arg);
1717 if (!output)
1718 return 0;
1719 if (!PyUnicode_Check(output)) {
1720 Py_DECREF(output);
1721 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1722 return 0;
1723 }
1724 }
1725 size = PyUnicode_GET_SIZE(output);
1726 data = PyUnicode_AS_UNICODE(output);
1727 if (size != Py_UNICODE_strlen(data)) {
1728 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1729 Py_DECREF(output);
1730 return 0;
1731 }
1732 *(PyObject**)addr = output;
1733 return Py_CLEANUP_SUPPORTED;
1734}
1735
1736
Martin v. Löwis5b222132007-06-10 09:51:05 +00001737char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001738_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001739{
Christian Heimesf3863112007-11-22 07:46:41 +00001740 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001741 if (!PyUnicode_Check(unicode)) {
1742 PyErr_BadArgument();
1743 return NULL;
1744 }
Christian Heimesf3863112007-11-22 07:46:41 +00001745 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1746 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001747 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001748 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001749 *psize = PyBytes_GET_SIZE(bytes);
1750 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001751}
1752
1753char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001754_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001755{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001756 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001757}
1758
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1760{
1761 if (!PyUnicode_Check(unicode)) {
1762 PyErr_BadArgument();
1763 goto onError;
1764 }
1765 return PyUnicode_AS_UNICODE(unicode);
1766
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 return NULL;
1769}
1770
Martin v. Löwis18e16552006-02-15 17:27:45 +00001771Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772{
1773 if (!PyUnicode_Check(unicode)) {
1774 PyErr_BadArgument();
1775 goto onError;
1776 }
1777 return PyUnicode_GET_SIZE(unicode);
1778
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 return -1;
1781}
1782
Thomas Wouters78890102000-07-22 19:25:51 +00001783const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001784{
1785 return unicode_default_encoding;
1786}
1787
1788int PyUnicode_SetDefaultEncoding(const char *encoding)
1789{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001790 if (strcmp(encoding, unicode_default_encoding) != 0) {
1791 PyErr_Format(PyExc_ValueError,
1792 "Can only set default encoding to %s",
1793 unicode_default_encoding);
1794 return -1;
1795 }
Fred Drakee4315f52000-05-09 19:53:39 +00001796 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001797}
1798
Victor Stinner554f3f02010-06-16 23:33:54 +00001799/* create or adjust a UnicodeDecodeError */
1800static void
1801make_decode_exception(PyObject **exceptionObject,
1802 const char *encoding,
1803 const char *input, Py_ssize_t length,
1804 Py_ssize_t startpos, Py_ssize_t endpos,
1805 const char *reason)
1806{
1807 if (*exceptionObject == NULL) {
1808 *exceptionObject = PyUnicodeDecodeError_Create(
1809 encoding, input, length, startpos, endpos, reason);
1810 }
1811 else {
1812 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1813 goto onError;
1814 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1815 goto onError;
1816 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1817 goto onError;
1818 }
1819 return;
1820
1821onError:
1822 Py_DECREF(*exceptionObject);
1823 *exceptionObject = NULL;
1824}
1825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826/* error handling callback helper:
1827 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001828 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001829 and adjust various state variables.
1830 return 0 on success, -1 on error
1831*/
1832
1833static
1834int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001835 const char *encoding, const char *reason,
1836 const char **input, const char **inend, Py_ssize_t *startinpos,
1837 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1838 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001840 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841
1842 PyObject *restuple = NULL;
1843 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001844 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001845 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001846 Py_ssize_t requiredsize;
1847 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001849 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001850 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001851 int res = -1;
1852
1853 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001854 *errorHandler = PyCodec_LookupError(errors);
1855 if (*errorHandler == NULL)
1856 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001857 }
1858
Victor Stinner554f3f02010-06-16 23:33:54 +00001859 make_decode_exception(exceptionObject,
1860 encoding,
1861 *input, *inend - *input,
1862 *startinpos, *endinpos,
1863 reason);
1864 if (*exceptionObject == NULL)
1865 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001866
1867 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1868 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001869 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001870 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001871 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001872 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001873 }
1874 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001875 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001876
1877 /* Copy back the bytes variables, which might have been modified by the
1878 callback */
1879 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1880 if (!inputobj)
1881 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001882 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001883 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001884 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001885 *input = PyBytes_AS_STRING(inputobj);
1886 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001887 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001888 /* we can DECREF safely, as the exception has another reference,
1889 so the object won't go away. */
1890 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001891
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001892 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001893 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001894 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001895 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1896 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001897 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898
1899 /* need more space? (at least enough for what we
1900 have+the replacement+the rest of the string (starting
1901 at the new input position), so we won't have to check space
1902 when there are no errors in the rest of the string) */
1903 repptr = PyUnicode_AS_UNICODE(repunicode);
1904 repsize = PyUnicode_GET_SIZE(repunicode);
1905 requiredsize = *outpos + repsize + insize-newpos;
1906 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001907 if (requiredsize<2*outsize)
1908 requiredsize = 2*outsize;
1909 if (_PyUnicode_Resize(output, requiredsize) < 0)
1910 goto onError;
1911 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001912 }
1913 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001914 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 Py_UNICODE_COPY(*outptr, repptr, repsize);
1916 *outptr += repsize;
1917 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001918
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 /* we made it! */
1920 res = 0;
1921
Benjamin Peterson29060642009-01-31 22:14:21 +00001922 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001923 Py_XDECREF(restuple);
1924 return res;
1925}
1926
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001927/* --- UTF-7 Codec -------------------------------------------------------- */
1928
Antoine Pitrou244651a2009-05-04 18:56:13 +00001929/* See RFC2152 for details. We encode conservatively and decode liberally. */
1930
/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                              \
    ((c) == '+' ? 62 :                              \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :       \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :  \
     63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
1961
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- the character; anything outside 1..127 is never direct
 * directO  -- non-zero if Set O characters may be written directly
 * directWS -- non-zero if whitespace may be written directly
 *
 * The flag arguments are parenthesized so that compound expressions
 * (e.g. "a || b") combine correctly with the && that follows them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002007
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002008PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 Py_ssize_t size,
2010 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002011{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002012 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2013}
2014
Antoine Pitrou244651a2009-05-04 18:56:13 +00002015/* The decoder. The only state we preserve is our read position,
2016 * i.e. how many characters we have consumed. So if we end in the
2017 * middle of a shift sequence we have to back off the read position
2018 * and the output to the beginning of the sequence, otherwise we lose
2019 * all the shift state (seen bits, number of bits seen, high
2020 * surrogate). */
2021
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002022PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002023 Py_ssize_t size,
2024 const char *errors,
2025 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002026{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002027 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002028 Py_ssize_t startinpos;
2029 Py_ssize_t endinpos;
2030 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002031 const char *e;
2032 PyUnicodeObject *unicode;
2033 Py_UNICODE *p;
2034 const char *errmsg = "";
2035 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002036 Py_UNICODE *shiftOutStart;
2037 unsigned int base64bits = 0;
2038 unsigned long base64buffer = 0;
2039 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002040 PyObject *errorHandler = NULL;
2041 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002042
2043 unicode = _PyUnicode_New(size);
2044 if (!unicode)
2045 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002046 if (size == 0) {
2047 if (consumed)
2048 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002049 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002050 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002051
2052 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002053 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002054 e = s + size;
2055
2056 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002058 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002059 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002060
Antoine Pitrou244651a2009-05-04 18:56:13 +00002061 if (inShift) { /* in a base-64 section */
2062 if (IS_BASE64(ch)) { /* consume a base-64 character */
2063 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2064 base64bits += 6;
2065 s++;
2066 if (base64bits >= 16) {
2067 /* we have enough bits for a UTF-16 value */
2068 Py_UNICODE outCh = (Py_UNICODE)
2069 (base64buffer >> (base64bits-16));
2070 base64bits -= 16;
2071 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2072 if (surrogate) {
2073 /* expecting a second surrogate */
2074 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2075#ifdef Py_UNICODE_WIDE
2076 *p++ = (((surrogate & 0x3FF)<<10)
2077 | (outCh & 0x3FF)) + 0x10000;
2078#else
2079 *p++ = surrogate;
2080 *p++ = outCh;
2081#endif
2082 surrogate = 0;
2083 }
2084 else {
2085 surrogate = 0;
2086 errmsg = "second surrogate missing";
2087 goto utf7Error;
2088 }
2089 }
2090 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2091 /* first surrogate */
2092 surrogate = outCh;
2093 }
2094 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2095 errmsg = "unexpected second surrogate";
2096 goto utf7Error;
2097 }
2098 else {
2099 *p++ = outCh;
2100 }
2101 }
2102 }
2103 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002104 inShift = 0;
2105 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002106 if (surrogate) {
2107 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002108 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002109 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002110 if (base64bits > 0) { /* left-over bits */
2111 if (base64bits >= 6) {
2112 /* We've seen at least one base-64 character */
2113 errmsg = "partial character in shift sequence";
2114 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002115 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002116 else {
2117 /* Some bits remain; they should be zero */
2118 if (base64buffer != 0) {
2119 errmsg = "non-zero padding bits in shift sequence";
2120 goto utf7Error;
2121 }
2122 }
2123 }
2124 if (ch != '-') {
2125 /* '-' is absorbed; other terminating
2126 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002127 *p++ = ch;
2128 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002129 }
2130 }
2131 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002133 s++; /* consume '+' */
2134 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135 s++;
2136 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002137 }
2138 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002139 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002140 shiftOutStart = p;
2141 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002142 }
2143 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002144 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002145 *p++ = ch;
2146 s++;
2147 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002148 else {
2149 startinpos = s-starts;
2150 s++;
2151 errmsg = "unexpected special character";
2152 goto utf7Error;
2153 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002154 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002155utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002156 outpos = p-PyUnicode_AS_UNICODE(unicode);
2157 endinpos = s-starts;
2158 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002159 errors, &errorHandler,
2160 "utf7", errmsg,
2161 &starts, &e, &startinpos, &endinpos, &exc, &s,
2162 &unicode, &outpos, &p))
2163 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002164 }
2165
Antoine Pitrou244651a2009-05-04 18:56:13 +00002166 /* end of string */
2167
2168 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2169 /* if we're in an inconsistent state, that's an error */
2170 if (surrogate ||
2171 (base64bits >= 6) ||
2172 (base64bits > 0 && base64buffer != 0)) {
2173 outpos = p-PyUnicode_AS_UNICODE(unicode);
2174 endinpos = size;
2175 if (unicode_decode_call_errorhandler(
2176 errors, &errorHandler,
2177 "utf7", "unterminated shift sequence",
2178 &starts, &e, &startinpos, &endinpos, &exc, &s,
2179 &unicode, &outpos, &p))
2180 goto onError;
2181 if (s < e)
2182 goto restart;
2183 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002184 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002185
2186 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002187 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002188 if (inShift) {
2189 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002190 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002191 }
2192 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002193 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002194 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002195 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002196
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002197 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002198 goto onError;
2199
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002200 Py_XDECREF(errorHandler);
2201 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002202 return (PyObject *)unicode;
2203
Benjamin Peterson29060642009-01-31 22:14:21 +00002204 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002205 Py_XDECREF(errorHandler);
2206 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002207 Py_DECREF(unicode);
2208 return NULL;
2209}
2210
2211
2212PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002213 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002214 int base64SetO,
2215 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002216 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002217{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002218 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002219 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002220 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002221 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002222 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002223 unsigned int base64bits = 0;
2224 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002225 char * out;
2226 char * start;
2227
2228 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002229 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002230
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002231 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002232 return PyErr_NoMemory();
2233
Antoine Pitrou244651a2009-05-04 18:56:13 +00002234 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002235 if (v == NULL)
2236 return NULL;
2237
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002238 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002239 for (;i < size; ++i) {
2240 Py_UNICODE ch = s[i];
2241
Antoine Pitrou244651a2009-05-04 18:56:13 +00002242 if (inShift) {
2243 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2244 /* shifting out */
2245 if (base64bits) { /* output remaining bits */
2246 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2247 base64buffer = 0;
2248 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002249 }
2250 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002251 /* Characters not in the BASE64 set implicitly unshift the sequence
2252 so no '-' is required, except if the character is itself a '-' */
2253 if (IS_BASE64(ch) || ch == '-') {
2254 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002255 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002256 *out++ = (char) ch;
2257 }
2258 else {
2259 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002260 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002261 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002262 else { /* not in a shift sequence */
2263 if (ch == '+') {
2264 *out++ = '+';
2265 *out++ = '-';
2266 }
2267 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2268 *out++ = (char) ch;
2269 }
2270 else {
2271 *out++ = '+';
2272 inShift = 1;
2273 goto encode_char;
2274 }
2275 }
2276 continue;
2277encode_char:
2278#ifdef Py_UNICODE_WIDE
2279 if (ch >= 0x10000) {
2280 /* code first surrogate */
2281 base64bits += 16;
2282 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2283 while (base64bits >= 6) {
2284 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2285 base64bits -= 6;
2286 }
2287 /* prepare second surrogate */
2288 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2289 }
2290#endif
2291 base64bits += 16;
2292 base64buffer = (base64buffer << 16) | ch;
2293 while (base64bits >= 6) {
2294 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2295 base64bits -= 6;
2296 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002297 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002298 if (base64bits)
2299 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2300 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002301 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002302 if (_PyBytes_Resize(&v, out - start) < 0)
2303 return NULL;
2304 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002305}
2306
Antoine Pitrou244651a2009-05-04 18:56:13 +00002307#undef IS_BASE64
2308#undef FROM_BASE64
2309#undef TO_BASE64
2310#undef DECODE_DIRECT
2311#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002312
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313/* --- UTF-8 Codec -------------------------------------------------------- */
2314
/* Read-only lookup table, hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2336
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002338 Py_ssize_t size,
2339 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340{
Walter Dörwald69652032004-09-07 20:24:22 +00002341 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2342}
2343
Antoine Pitrouab868312009-01-10 15:40:25 +00002344/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2345#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2346
2347/* Mask to quickly check whether a C 'long' contains a
2348 non-ASCII, UTF8-encoded char. */
2349#if (SIZEOF_LONG == 8)
2350# define ASCII_CHAR_MASK 0x8080808080808080L
2351#elif (SIZEOF_LONG == 4)
2352# define ASCII_CHAR_MASK 0x80808080L
2353#else
2354# error C 'long' size should be either 4 or 8!
2355#endif
2356
Walter Dörwald69652032004-09-07 20:24:22 +00002357PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002358 Py_ssize_t size,
2359 const char *errors,
2360 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002361{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002362 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002364 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002365 Py_ssize_t startinpos;
2366 Py_ssize_t endinpos;
2367 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002368 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369 PyUnicodeObject *unicode;
2370 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002371 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002372 PyObject *errorHandler = NULL;
2373 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374
2375 /* Note: size will always be longer than the resulting Unicode
2376 character count */
2377 unicode = _PyUnicode_New(size);
2378 if (!unicode)
2379 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002380 if (size == 0) {
2381 if (consumed)
2382 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002384 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002385
2386 /* Unpack UTF-8 encoded data */
2387 p = unicode->str;
2388 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002389 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390
2391 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002392 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393
2394 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002395 /* Fast path for runs of ASCII characters. Given that common UTF-8
2396 input will consist of an overwhelming majority of ASCII
2397 characters, we try to optimize for this case by checking
2398 as many characters as a C 'long' can contain.
2399 First, check if we can do an aligned read, as most CPUs have
2400 a penalty for unaligned reads.
2401 */
2402 if (!((size_t) s & LONG_PTR_MASK)) {
2403 /* Help register allocation */
2404 register const char *_s = s;
2405 register Py_UNICODE *_p = p;
2406 while (_s < aligned_end) {
2407 /* Read a whole long at a time (either 4 or 8 bytes),
2408 and do a fast unrolled copy if it only contains ASCII
2409 characters. */
2410 unsigned long data = *(unsigned long *) _s;
2411 if (data & ASCII_CHAR_MASK)
2412 break;
2413 _p[0] = (unsigned char) _s[0];
2414 _p[1] = (unsigned char) _s[1];
2415 _p[2] = (unsigned char) _s[2];
2416 _p[3] = (unsigned char) _s[3];
2417#if (SIZEOF_LONG == 8)
2418 _p[4] = (unsigned char) _s[4];
2419 _p[5] = (unsigned char) _s[5];
2420 _p[6] = (unsigned char) _s[6];
2421 _p[7] = (unsigned char) _s[7];
2422#endif
2423 _s += SIZEOF_LONG;
2424 _p += SIZEOF_LONG;
2425 }
2426 s = _s;
2427 p = _p;
2428 if (s == e)
2429 break;
2430 ch = (unsigned char)*s;
2431 }
2432 }
2433
2434 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002435 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002436 s++;
2437 continue;
2438 }
2439
2440 n = utf8_code_length[ch];
2441
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002442 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002443 if (consumed)
2444 break;
2445 else {
2446 errmsg = "unexpected end of data";
2447 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002448 endinpos = startinpos+1;
2449 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2450 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002451 goto utf8Error;
2452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454
2455 switch (n) {
2456
2457 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002458 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002459 startinpos = s-starts;
2460 endinpos = startinpos+1;
2461 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462
2463 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002464 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 startinpos = s-starts;
2466 endinpos = startinpos+1;
2467 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468
2469 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002470 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002471 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002472 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002473 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002474 goto utf8Error;
2475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002477 assert ((ch > 0x007F) && (ch <= 0x07FF));
2478 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 break;
2480
2481 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002482 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2483 will result in surrogates in range d800-dfff. Surrogates are
2484 not valid UTF-8 so they are rejected.
2485 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2486 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002487 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002488 (s[2] & 0xc0) != 0x80 ||
2489 ((unsigned char)s[0] == 0xE0 &&
2490 (unsigned char)s[1] < 0xA0) ||
2491 ((unsigned char)s[0] == 0xED &&
2492 (unsigned char)s[1] > 0x9F)) {
2493 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002495 endinpos = startinpos + 1;
2496
2497 /* if s[1] first two bits are 1 and 0, then the invalid
2498 continuation byte is s[2], so increment endinpos by 1,
2499 if not, s[1] is invalid and endinpos doesn't need to
2500 be incremented. */
2501 if ((s[1] & 0xC0) == 0x80)
2502 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002503 goto utf8Error;
2504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002506 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2507 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002508 break;
2509
2510 case 4:
2511 if ((s[1] & 0xc0) != 0x80 ||
2512 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002513 (s[3] & 0xc0) != 0x80 ||
2514 ((unsigned char)s[0] == 0xF0 &&
2515 (unsigned char)s[1] < 0x90) ||
2516 ((unsigned char)s[0] == 0xF4 &&
2517 (unsigned char)s[1] > 0x8F)) {
2518 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002519 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002520 endinpos = startinpos + 1;
2521 if ((s[1] & 0xC0) == 0x80) {
2522 endinpos++;
2523 if ((s[2] & 0xC0) == 0x80)
2524 endinpos++;
2525 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002526 goto utf8Error;
2527 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002528 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002529 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2530 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2531
Fredrik Lundh8f455852001-06-27 18:59:43 +00002532#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002534#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002535 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002536
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002537 /* translate from 10000..10FFFF to 0..FFFF */
2538 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002539
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002540 /* high surrogate = top 10 bits added to D800 */
2541 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002542
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002543 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002544 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002545#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 }
2548 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002549 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002550
Benjamin Peterson29060642009-01-31 22:14:21 +00002551 utf8Error:
2552 outpos = p-PyUnicode_AS_UNICODE(unicode);
2553 if (unicode_decode_call_errorhandler(
2554 errors, &errorHandler,
2555 "utf8", errmsg,
2556 &starts, &e, &startinpos, &endinpos, &exc, &s,
2557 &unicode, &outpos, &p))
2558 goto onError;
2559 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 }
Walter Dörwald69652032004-09-07 20:24:22 +00002561 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002562 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563
2564 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002565 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 goto onError;
2567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002568 Py_XDECREF(errorHandler);
2569 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 return (PyObject *)unicode;
2571
Benjamin Peterson29060642009-01-31 22:14:21 +00002572 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 Py_XDECREF(errorHandler);
2574 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 Py_DECREF(unicode);
2576 return NULL;
2577}
2578
Antoine Pitrouab868312009-01-10 15:40:25 +00002579#undef ASCII_CHAR_MASK
2580
2581
Tim Peters602f7402002-04-27 18:03:26 +00002582/* Allocation strategy: if the string is short, convert into a stack buffer
2583 and allocate exactly as much space needed at the end. Else allocate the
2584 maximum possible needed (4 result bytes per Unicode character), and return
2585 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002586*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002587PyObject *
2588PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002589 Py_ssize_t size,
2590 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591{
Tim Peters602f7402002-04-27 18:03:26 +00002592#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002593
Guido van Rossum98297ee2007-11-06 21:34:58 +00002594 Py_ssize_t i; /* index into s of next input byte */
2595 PyObject *result; /* result string object */
2596 char *p; /* next free byte in output buffer */
2597 Py_ssize_t nallocated; /* number of result bytes allocated */
2598 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002599 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002600 PyObject *errorHandler = NULL;
2601 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002602
Tim Peters602f7402002-04-27 18:03:26 +00002603 assert(s != NULL);
2604 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
Tim Peters602f7402002-04-27 18:03:26 +00002606 if (size <= MAX_SHORT_UNICHARS) {
2607 /* Write into the stack buffer; nallocated can't overflow.
2608 * At the end, we'll allocate exactly as much heap space as it
2609 * turns out we need.
2610 */
2611 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002612 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002613 p = stackbuf;
2614 }
2615 else {
2616 /* Overallocate on the heap, and give the excess back at the end. */
2617 nallocated = size * 4;
2618 if (nallocated / 4 != size) /* overflow! */
2619 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002620 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002621 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002622 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002623 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002624 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002625
Tim Peters602f7402002-04-27 18:03:26 +00002626 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002627 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002628
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002629 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002630 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002632
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002634 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002635 *p++ = (char)(0xc0 | (ch >> 6));
2636 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002637 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002638#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002639 /* Special case: check for high and low surrogate */
2640 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2641 Py_UCS4 ch2 = s[i];
2642 /* Combine the two surrogates to form a UCS4 value */
2643 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2644 i++;
2645
2646 /* Encode UCS4 Unicode ordinals */
2647 *p++ = (char)(0xf0 | (ch >> 18));
2648 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002649 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2650 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002651 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002652#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002653 Py_ssize_t newpos;
2654 PyObject *rep;
2655 Py_ssize_t repsize, k;
2656 rep = unicode_encode_call_errorhandler
2657 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2658 s, size, &exc, i-1, i, &newpos);
2659 if (!rep)
2660 goto error;
2661
2662 if (PyBytes_Check(rep))
2663 repsize = PyBytes_GET_SIZE(rep);
2664 else
2665 repsize = PyUnicode_GET_SIZE(rep);
2666
2667 if (repsize > 4) {
2668 Py_ssize_t offset;
2669
2670 if (result == NULL)
2671 offset = p - stackbuf;
2672 else
2673 offset = p - PyBytes_AS_STRING(result);
2674
2675 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2676 /* integer overflow */
2677 PyErr_NoMemory();
2678 goto error;
2679 }
2680 nallocated += repsize - 4;
2681 if (result != NULL) {
2682 if (_PyBytes_Resize(&result, nallocated) < 0)
2683 goto error;
2684 } else {
2685 result = PyBytes_FromStringAndSize(NULL, nallocated);
2686 if (result == NULL)
2687 goto error;
2688 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2689 }
2690 p = PyBytes_AS_STRING(result) + offset;
2691 }
2692
2693 if (PyBytes_Check(rep)) {
2694 char *prep = PyBytes_AS_STRING(rep);
2695 for(k = repsize; k > 0; k--)
2696 *p++ = *prep++;
2697 } else /* rep is unicode */ {
2698 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2699 Py_UNICODE c;
2700
2701 for(k=0; k<repsize; k++) {
2702 c = prep[k];
2703 if (0x80 <= c) {
2704 raise_encode_exception(&exc, "utf-8", s, size,
2705 i-1, i, "surrogates not allowed");
2706 goto error;
2707 }
2708 *p++ = (char)prep[k];
2709 }
2710 }
2711 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002712#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002713 }
Victor Stinner445a6232010-04-22 20:01:57 +00002714#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002715 } else if (ch < 0x10000) {
2716 *p++ = (char)(0xe0 | (ch >> 12));
2717 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2718 *p++ = (char)(0x80 | (ch & 0x3f));
2719 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002720 /* Encode UCS4 Unicode ordinals */
2721 *p++ = (char)(0xf0 | (ch >> 18));
2722 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2723 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2724 *p++ = (char)(0x80 | (ch & 0x3f));
2725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002727
Guido van Rossum98297ee2007-11-06 21:34:58 +00002728 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002729 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002730 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002731 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002732 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002733 }
2734 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002735 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002736 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002737 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002738 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002739 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002740 Py_XDECREF(errorHandler);
2741 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002742 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002743 error:
2744 Py_XDECREF(errorHandler);
2745 Py_XDECREF(exc);
2746 Py_XDECREF(result);
2747 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002748
Tim Peters602f7402002-04-27 18:03:26 +00002749#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750}
2751
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2753{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 if (!PyUnicode_Check(unicode)) {
2755 PyErr_BadArgument();
2756 return NULL;
2757 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002758 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002759 PyUnicode_GET_SIZE(unicode),
2760 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761}
2762
Walter Dörwald41980ca2007-08-16 21:55:45 +00002763/* --- UTF-32 Codec ------------------------------------------------------- */
2764
2765PyObject *
2766PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002767 Py_ssize_t size,
2768 const char *errors,
2769 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002770{
2771 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2772}
2773
2774PyObject *
2775PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002776 Py_ssize_t size,
2777 const char *errors,
2778 int *byteorder,
2779 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002780{
2781 const char *starts = s;
2782 Py_ssize_t startinpos;
2783 Py_ssize_t endinpos;
2784 Py_ssize_t outpos;
2785 PyUnicodeObject *unicode;
2786 Py_UNICODE *p;
2787#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002788 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002789 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002790#else
2791 const int pairs = 0;
2792#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002793 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002794 int bo = 0; /* assume native ordering by default */
2795 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002796 /* Offsets from q for retrieving bytes in the right order. */
2797#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2798 int iorder[] = {0, 1, 2, 3};
2799#else
2800 int iorder[] = {3, 2, 1, 0};
2801#endif
2802 PyObject *errorHandler = NULL;
2803 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002804
Walter Dörwald41980ca2007-08-16 21:55:45 +00002805 q = (unsigned char *)s;
2806 e = q + size;
2807
2808 if (byteorder)
2809 bo = *byteorder;
2810
2811 /* Check for BOM marks (U+FEFF) in the input and adjust current
2812 byte order setting accordingly. In native mode, the leading BOM
2813 mark is skipped, in all other modes, it is copied to the output
2814 stream as-is (giving a ZWNBSP character). */
2815 if (bo == 0) {
2816 if (size >= 4) {
2817 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002818 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002819#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002820 if (bom == 0x0000FEFF) {
2821 q += 4;
2822 bo = -1;
2823 }
2824 else if (bom == 0xFFFE0000) {
2825 q += 4;
2826 bo = 1;
2827 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002828#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002829 if (bom == 0x0000FEFF) {
2830 q += 4;
2831 bo = 1;
2832 }
2833 else if (bom == 0xFFFE0000) {
2834 q += 4;
2835 bo = -1;
2836 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002837#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002839 }
2840
2841 if (bo == -1) {
2842 /* force LE */
2843 iorder[0] = 0;
2844 iorder[1] = 1;
2845 iorder[2] = 2;
2846 iorder[3] = 3;
2847 }
2848 else if (bo == 1) {
2849 /* force BE */
2850 iorder[0] = 3;
2851 iorder[1] = 2;
2852 iorder[2] = 1;
2853 iorder[3] = 0;
2854 }
2855
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002856 /* On narrow builds we split characters outside the BMP into two
2857 codepoints => count how much extra space we need. */
2858#ifndef Py_UNICODE_WIDE
2859 for (qq = q; qq < e; qq += 4)
2860 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2861 pairs++;
2862#endif
2863
2864 /* This might be one to much, because of a BOM */
2865 unicode = _PyUnicode_New((size+3)/4+pairs);
2866 if (!unicode)
2867 return NULL;
2868 if (size == 0)
2869 return (PyObject *)unicode;
2870
2871 /* Unpack UTF-32 encoded data */
2872 p = unicode->str;
2873
Walter Dörwald41980ca2007-08-16 21:55:45 +00002874 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002875 Py_UCS4 ch;
2876 /* remaining bytes at the end? (size should be divisible by 4) */
2877 if (e-q<4) {
2878 if (consumed)
2879 break;
2880 errmsg = "truncated data";
2881 startinpos = ((const char *)q)-starts;
2882 endinpos = ((const char *)e)-starts;
2883 goto utf32Error;
2884 /* The remaining input chars are ignored if the callback
2885 chooses to skip the input */
2886 }
2887 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2888 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002889
Benjamin Peterson29060642009-01-31 22:14:21 +00002890 if (ch >= 0x110000)
2891 {
2892 errmsg = "codepoint not in range(0x110000)";
2893 startinpos = ((const char *)q)-starts;
2894 endinpos = startinpos+4;
2895 goto utf32Error;
2896 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002897#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002898 if (ch >= 0x10000)
2899 {
2900 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2901 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2902 }
2903 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002904#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002905 *p++ = ch;
2906 q += 4;
2907 continue;
2908 utf32Error:
2909 outpos = p-PyUnicode_AS_UNICODE(unicode);
2910 if (unicode_decode_call_errorhandler(
2911 errors, &errorHandler,
2912 "utf32", errmsg,
2913 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2914 &unicode, &outpos, &p))
2915 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002916 }
2917
2918 if (byteorder)
2919 *byteorder = bo;
2920
2921 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002922 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002923
2924 /* Adjust length */
2925 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2926 goto onError;
2927
2928 Py_XDECREF(errorHandler);
2929 Py_XDECREF(exc);
2930 return (PyObject *)unicode;
2931
Benjamin Peterson29060642009-01-31 22:14:21 +00002932 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002933 Py_DECREF(unicode);
2934 Py_XDECREF(errorHandler);
2935 Py_XDECREF(exc);
2936 return NULL;
2937}
2938
2939PyObject *
2940PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 Py_ssize_t size,
2942 const char *errors,
2943 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002944{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002945 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002946 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002947 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002948#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002949 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002950#else
2951 const int pairs = 0;
2952#endif
2953 /* Offsets from p for storing byte pairs in the right order. */
2954#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2955 int iorder[] = {0, 1, 2, 3};
2956#else
2957 int iorder[] = {3, 2, 1, 0};
2958#endif
2959
Benjamin Peterson29060642009-01-31 22:14:21 +00002960#define STORECHAR(CH) \
2961 do { \
2962 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2963 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2964 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2965 p[iorder[0]] = (CH) & 0xff; \
2966 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002967 } while(0)
2968
2969 /* In narrow builds we can output surrogate pairs as one codepoint,
2970 so we need less space. */
2971#ifndef Py_UNICODE_WIDE
2972 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2974 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2975 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002976#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002977 nsize = (size - pairs + (byteorder == 0));
2978 bytesize = nsize * 4;
2979 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002980 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002981 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002982 if (v == NULL)
2983 return NULL;
2984
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002985 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002986 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002987 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002988 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002989 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002990
2991 if (byteorder == -1) {
2992 /* force LE */
2993 iorder[0] = 0;
2994 iorder[1] = 1;
2995 iorder[2] = 2;
2996 iorder[3] = 3;
2997 }
2998 else if (byteorder == 1) {
2999 /* force BE */
3000 iorder[0] = 3;
3001 iorder[1] = 2;
3002 iorder[2] = 1;
3003 iorder[3] = 0;
3004 }
3005
3006 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003008#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003009 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3010 Py_UCS4 ch2 = *s;
3011 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3012 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3013 s++;
3014 size--;
3015 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003016 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003017#endif
3018 STORECHAR(ch);
3019 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003020
3021 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003022 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003023#undef STORECHAR
3024}
3025
3026PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3027{
3028 if (!PyUnicode_Check(unicode)) {
3029 PyErr_BadArgument();
3030 return NULL;
3031 }
3032 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 PyUnicode_GET_SIZE(unicode),
3034 NULL,
3035 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003036}
3037
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038/* --- UTF-16 Codec ------------------------------------------------------- */
3039
Tim Peters772747b2001-08-09 22:21:55 +00003040PyObject *
3041PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 Py_ssize_t size,
3043 const char *errors,
3044 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045{
Walter Dörwald69652032004-09-07 20:24:22 +00003046 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3047}
3048
Antoine Pitrouab868312009-01-10 15:40:25 +00003049/* Two masks for fast checking of whether a C 'long' may contain
3050 UTF16-encoded surrogate characters. This is an efficient heuristic,
3051 assuming that non-surrogate characters with a code point >= 0x8000 are
3052 rare in most input.
3053 FAST_CHAR_MASK is used when the input is in native byte ordering,
3054 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003055*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003056#if (SIZEOF_LONG == 8)
3057# define FAST_CHAR_MASK 0x8000800080008000L
3058# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3059#elif (SIZEOF_LONG == 4)
3060# define FAST_CHAR_MASK 0x80008000L
3061# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3062#else
3063# error C 'long' size should be either 4 or 8!
3064#endif
3065
Walter Dörwald69652032004-09-07 20:24:22 +00003066PyObject *
3067PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003068 Py_ssize_t size,
3069 const char *errors,
3070 int *byteorder,
3071 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003072{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003073 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003074 Py_ssize_t startinpos;
3075 Py_ssize_t endinpos;
3076 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 PyUnicodeObject *unicode;
3078 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003079 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003080 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003081 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003082 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003083 /* Offsets from q for retrieving byte pairs in the right order. */
3084#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3085 int ihi = 1, ilo = 0;
3086#else
3087 int ihi = 0, ilo = 1;
3088#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 PyObject *errorHandler = NULL;
3090 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091
3092 /* Note: size will always be longer than the resulting Unicode
3093 character count */
3094 unicode = _PyUnicode_New(size);
3095 if (!unicode)
3096 return NULL;
3097 if (size == 0)
3098 return (PyObject *)unicode;
3099
3100 /* Unpack UTF-16 encoded data */
3101 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003102 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003103 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104
3105 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003106 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003108 /* Check for BOM marks (U+FEFF) in the input and adjust current
3109 byte order setting accordingly. In native mode, the leading BOM
3110 mark is skipped, in all other modes, it is copied to the output
3111 stream as-is (giving a ZWNBSP character). */
3112 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003113 if (size >= 2) {
3114 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003115#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003116 if (bom == 0xFEFF) {
3117 q += 2;
3118 bo = -1;
3119 }
3120 else if (bom == 0xFFFE) {
3121 q += 2;
3122 bo = 1;
3123 }
Tim Petersced69f82003-09-16 20:30:58 +00003124#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 if (bom == 0xFEFF) {
3126 q += 2;
3127 bo = 1;
3128 }
3129 else if (bom == 0xFFFE) {
3130 q += 2;
3131 bo = -1;
3132 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003133#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003134 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136
Tim Peters772747b2001-08-09 22:21:55 +00003137 if (bo == -1) {
3138 /* force LE */
3139 ihi = 1;
3140 ilo = 0;
3141 }
3142 else if (bo == 1) {
3143 /* force BE */
3144 ihi = 0;
3145 ilo = 1;
3146 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003147#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3148 native_ordering = ilo < ihi;
3149#else
3150 native_ordering = ilo > ihi;
3151#endif
Tim Peters772747b2001-08-09 22:21:55 +00003152
Antoine Pitrouab868312009-01-10 15:40:25 +00003153 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003154 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003156 /* First check for possible aligned read of a C 'long'. Unaligned
3157 reads are more expensive, better to defer to another iteration. */
3158 if (!((size_t) q & LONG_PTR_MASK)) {
3159 /* Fast path for runs of non-surrogate chars. */
3160 register const unsigned char *_q = q;
3161 Py_UNICODE *_p = p;
3162 if (native_ordering) {
3163 /* Native ordering is simple: as long as the input cannot
3164 possibly contain a surrogate char, do an unrolled copy
3165 of several 16-bit code points to the target object.
3166 The non-surrogate check is done on several input bytes
3167 at a time (as many as a C 'long' can contain). */
3168 while (_q < aligned_end) {
3169 unsigned long data = * (unsigned long *) _q;
3170 if (data & FAST_CHAR_MASK)
3171 break;
3172 _p[0] = ((unsigned short *) _q)[0];
3173 _p[1] = ((unsigned short *) _q)[1];
3174#if (SIZEOF_LONG == 8)
3175 _p[2] = ((unsigned short *) _q)[2];
3176 _p[3] = ((unsigned short *) _q)[3];
3177#endif
3178 _q += SIZEOF_LONG;
3179 _p += SIZEOF_LONG / 2;
3180 }
3181 }
3182 else {
3183 /* Byteswapped ordering is similar, but we must decompose
3184 the copy bytewise, and take care of zero'ing out the
3185 upper bytes if the target object is in 32-bit units
3186 (that is, in UCS-4 builds). */
3187 while (_q < aligned_end) {
3188 unsigned long data = * (unsigned long *) _q;
3189 if (data & SWAPPED_FAST_CHAR_MASK)
3190 break;
3191 /* Zero upper bytes in UCS-4 builds */
3192#if (Py_UNICODE_SIZE > 2)
3193 _p[0] = 0;
3194 _p[1] = 0;
3195#if (SIZEOF_LONG == 8)
3196 _p[2] = 0;
3197 _p[3] = 0;
3198#endif
3199#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003200 /* Issue #4916; UCS-4 builds on big endian machines must
3201 fill the two last bytes of each 4-byte unit. */
3202#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3203# define OFF 2
3204#else
3205# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003206#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003207 ((unsigned char *) _p)[OFF + 1] = _q[0];
3208 ((unsigned char *) _p)[OFF + 0] = _q[1];
3209 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3210 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3211#if (SIZEOF_LONG == 8)
3212 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3213 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3214 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3215 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3216#endif
3217#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003218 _q += SIZEOF_LONG;
3219 _p += SIZEOF_LONG / 2;
3220 }
3221 }
3222 p = _p;
3223 q = _q;
3224 if (q >= e)
3225 break;
3226 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003228
Benjamin Peterson14339b62009-01-31 16:36:08 +00003229 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003230
3231 if (ch < 0xD800 || ch > 0xDFFF) {
3232 *p++ = ch;
3233 continue;
3234 }
3235
3236 /* UTF-16 code pair: */
3237 if (q > e) {
3238 errmsg = "unexpected end of data";
3239 startinpos = (((const char *)q) - 2) - starts;
3240 endinpos = ((const char *)e) + 1 - starts;
3241 goto utf16Error;
3242 }
3243 if (0xD800 <= ch && ch <= 0xDBFF) {
3244 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3245 q += 2;
3246 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003247#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 *p++ = ch;
3249 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003250#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003251 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003252#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003253 continue;
3254 }
3255 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003256 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 startinpos = (((const char *)q)-4)-starts;
3258 endinpos = startinpos+2;
3259 goto utf16Error;
3260 }
3261
Benjamin Peterson14339b62009-01-31 16:36:08 +00003262 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003263 errmsg = "illegal encoding";
3264 startinpos = (((const char *)q)-2)-starts;
3265 endinpos = startinpos+2;
3266 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003267
Benjamin Peterson29060642009-01-31 22:14:21 +00003268 utf16Error:
3269 outpos = p - PyUnicode_AS_UNICODE(unicode);
3270 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003271 errors,
3272 &errorHandler,
3273 "utf16", errmsg,
3274 &starts,
3275 (const char **)&e,
3276 &startinpos,
3277 &endinpos,
3278 &exc,
3279 (const char **)&q,
3280 &unicode,
3281 &outpos,
3282 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003285 /* remaining byte at the end? (size should be even) */
3286 if (e == q) {
3287 if (!consumed) {
3288 errmsg = "truncated data";
3289 startinpos = ((const char *)q) - starts;
3290 endinpos = ((const char *)e) + 1 - starts;
3291 outpos = p - PyUnicode_AS_UNICODE(unicode);
3292 if (unicode_decode_call_errorhandler(
3293 errors,
3294 &errorHandler,
3295 "utf16", errmsg,
3296 &starts,
3297 (const char **)&e,
3298 &startinpos,
3299 &endinpos,
3300 &exc,
3301 (const char **)&q,
3302 &unicode,
3303 &outpos,
3304 &p))
3305 goto onError;
3306 /* The remaining input chars are ignored if the callback
3307 chooses to skip the input */
3308 }
3309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310
3311 if (byteorder)
3312 *byteorder = bo;
3313
Walter Dörwald69652032004-09-07 20:24:22 +00003314 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003315 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003316
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003318 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319 goto onError;
3320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 Py_XDECREF(errorHandler);
3322 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 return (PyObject *)unicode;
3324
Benjamin Peterson29060642009-01-31 22:14:21 +00003325 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003327 Py_XDECREF(errorHandler);
3328 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329 return NULL;
3330}
3331
Antoine Pitrouab868312009-01-10 15:40:25 +00003332#undef FAST_CHAR_MASK
3333#undef SWAPPED_FAST_CHAR_MASK
3334
Tim Peters772747b2001-08-09 22:21:55 +00003335PyObject *
3336PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 Py_ssize_t size,
3338 const char *errors,
3339 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003341 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003342 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003343 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003344#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003345 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003346#else
3347 const int pairs = 0;
3348#endif
Tim Peters772747b2001-08-09 22:21:55 +00003349 /* Offsets from p for storing byte pairs in the right order. */
3350#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3351 int ihi = 1, ilo = 0;
3352#else
3353 int ihi = 0, ilo = 1;
3354#endif
3355
Benjamin Peterson29060642009-01-31 22:14:21 +00003356#define STORECHAR(CH) \
3357 do { \
3358 p[ihi] = ((CH) >> 8) & 0xff; \
3359 p[ilo] = (CH) & 0xff; \
3360 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003361 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003363#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003364 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003365 if (s[i] >= 0x10000)
3366 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003367#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003368 /* 2 * (size + pairs + (byteorder == 0)) */
3369 if (size > PY_SSIZE_T_MAX ||
3370 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003371 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003372 nsize = size + pairs + (byteorder == 0);
3373 bytesize = nsize * 2;
3374 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003375 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003376 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 if (v == NULL)
3378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003380 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003382 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003383 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003384 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003385
3386 if (byteorder == -1) {
3387 /* force LE */
3388 ihi = 1;
3389 ilo = 0;
3390 }
3391 else if (byteorder == 1) {
3392 /* force BE */
3393 ihi = 0;
3394 ilo = 1;
3395 }
3396
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003397 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003398 Py_UNICODE ch = *s++;
3399 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003400#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003401 if (ch >= 0x10000) {
3402 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3403 ch = 0xD800 | ((ch-0x10000) >> 10);
3404 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003405#endif
Tim Peters772747b2001-08-09 22:21:55 +00003406 STORECHAR(ch);
3407 if (ch2)
3408 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003409 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003410
3411 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003412 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003413#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414}
3415
3416PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3417{
3418 if (!PyUnicode_Check(unicode)) {
3419 PyErr_BadArgument();
3420 return NULL;
3421 }
3422 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003423 PyUnicode_GET_SIZE(unicode),
3424 NULL,
3425 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426}
3427
3428/* --- Unicode Escape Codec ----------------------------------------------- */
3429
/* Character-name lookup API used for \N{...} escapes.  Stays NULL until
   PyUnicode_DecodeUnicodeEscape() first meets a \N escape and loads it
   with PyCapsule_Import(). */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003431
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 Py_ssize_t size,
3434 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003437 Py_ssize_t startinpos;
3438 Py_ssize_t endinpos;
3439 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003444 char* message;
3445 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003446 PyObject *errorHandler = NULL;
3447 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003448
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 /* Escaped strings will always be longer than the resulting
3450 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451 length after conversion to the true value.
3452 (but if the error callback returns a long replacement string
3453 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 v = _PyUnicode_New(size);
3455 if (v == NULL)
3456 goto onError;
3457 if (size == 0)
3458 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003459
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003462
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 while (s < end) {
3464 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003465 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467
3468 /* Non-escape characters are interpreted as Unicode ordinals */
3469 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003470 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 continue;
3472 }
3473
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 /* \ - Escapes */
3476 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003477 c = *s++;
3478 if (s > end)
3479 c = '\0'; /* Invalid after \ */
3480 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 case '\n': break;
3484 case '\\': *p++ = '\\'; break;
3485 case '\'': *p++ = '\''; break;
3486 case '\"': *p++ = '\"'; break;
3487 case 'b': *p++ = '\b'; break;
3488 case 'f': *p++ = '\014'; break; /* FF */
3489 case 't': *p++ = '\t'; break;
3490 case 'n': *p++ = '\n'; break;
3491 case 'r': *p++ = '\r'; break;
3492 case 'v': *p++ = '\013'; break; /* VT */
3493 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3494
Benjamin Peterson29060642009-01-31 22:14:21 +00003495 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496 case '0': case '1': case '2': case '3':
3497 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003498 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003499 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003500 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003501 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003502 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003504 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 break;
3506
Benjamin Peterson29060642009-01-31 22:14:21 +00003507 /* hex escapes */
3508 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003510 digits = 2;
3511 message = "truncated \\xXX escape";
3512 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513
Benjamin Peterson29060642009-01-31 22:14:21 +00003514 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003516 digits = 4;
3517 message = "truncated \\uXXXX escape";
3518 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519
Benjamin Peterson29060642009-01-31 22:14:21 +00003520 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003521 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003522 digits = 8;
3523 message = "truncated \\UXXXXXXXX escape";
3524 hexescape:
3525 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 outpos = p-PyUnicode_AS_UNICODE(v);
3527 if (s+digits>end) {
3528 endinpos = size;
3529 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003530 errors, &errorHandler,
3531 "unicodeescape", "end of string in escape sequence",
3532 &starts, &end, &startinpos, &endinpos, &exc, &s,
3533 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 goto onError;
3535 goto nextByte;
3536 }
3537 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003538 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003539 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 endinpos = (s+i+1)-starts;
3541 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003542 errors, &errorHandler,
3543 "unicodeescape", message,
3544 &starts, &end, &startinpos, &endinpos, &exc, &s,
3545 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003546 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003548 }
3549 chr = (chr<<4) & ~0xF;
3550 if (c >= '0' && c <= '9')
3551 chr += c - '0';
3552 else if (c >= 'a' && c <= 'f')
3553 chr += 10 + c - 'a';
3554 else
3555 chr += 10 + c - 'A';
3556 }
3557 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003558 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 /* _decoding_error will have already written into the
3560 target buffer. */
3561 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003562 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003563 /* when we get here, chr is a 32-bit unicode character */
3564 if (chr <= 0xffff)
3565 /* UCS-2 character */
3566 *p++ = (Py_UNICODE) chr;
3567 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003568 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003569 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003570#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003571 *p++ = chr;
3572#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003573 chr -= 0x10000L;
3574 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003575 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003576#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003577 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 endinpos = s-starts;
3579 outpos = p-PyUnicode_AS_UNICODE(v);
3580 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003581 errors, &errorHandler,
3582 "unicodeescape", "illegal Unicode character",
3583 &starts, &end, &startinpos, &endinpos, &exc, &s,
3584 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003585 goto onError;
3586 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003587 break;
3588
Benjamin Peterson29060642009-01-31 22:14:21 +00003589 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003590 case 'N':
3591 message = "malformed \\N character escape";
3592 if (ucnhash_CAPI == NULL) {
3593 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003594 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003595 if (ucnhash_CAPI == NULL)
3596 goto ucnhashError;
3597 }
3598 if (*s == '{') {
3599 const char *start = s+1;
3600 /* look for the closing brace */
3601 while (*s != '}' && s < end)
3602 s++;
3603 if (s > start && s < end && *s == '}') {
3604 /* found a name. look it up in the unicode database */
3605 message = "unknown Unicode character name";
3606 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003607 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003608 goto store;
3609 }
3610 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 endinpos = s-starts;
3612 outpos = p-PyUnicode_AS_UNICODE(v);
3613 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003614 errors, &errorHandler,
3615 "unicodeescape", message,
3616 &starts, &end, &startinpos, &endinpos, &exc, &s,
3617 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003618 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003619 break;
3620
3621 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003622 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 message = "\\ at end of string";
3624 s--;
3625 endinpos = s-starts;
3626 outpos = p-PyUnicode_AS_UNICODE(v);
3627 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 errors, &errorHandler,
3629 "unicodeescape", message,
3630 &starts, &end, &startinpos, &endinpos, &exc, &s,
3631 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003632 goto onError;
3633 }
3634 else {
3635 *p++ = '\\';
3636 *p++ = (unsigned char)s[-1];
3637 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003638 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003643 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003645 Py_XDECREF(errorHandler);
3646 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003648
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003650 PyErr_SetString(
3651 PyExc_UnicodeError,
3652 "\\N escapes not supported (can't load unicodedata module)"
3653 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003654 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 Py_XDECREF(errorHandler);
3656 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003657 return NULL;
3658
Benjamin Peterson29060642009-01-31 22:14:21 +00003659 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 Py_XDECREF(errorHandler);
3662 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 return NULL;
3664}
3665
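/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   NOTE(review): this comment appears to describe
   PyUnicode_EncodeUnicodeEscape() further below rather than the findchar()
   helper that immediately follows it.  That function takes only (s, size),
   so the remark about a `quotes` argument looks stale -- confirm before
   relying on it.

*/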
3672
Thomas Wouters477c8d52006-05-27 19:21:47 +00003673Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003674 Py_ssize_t size,
3675 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003676{
3677 /* like wcschr, but doesn't stop at NULL characters */
3678
3679 while (size-- > 0) {
3680 if (*s == ch)
3681 return s;
3682 s++;
3683 }
3684
3685 return NULL;
3686}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003687
/* Lowercase hexadecimal digits, indexed by a 4-bit value (0..15) when
   formatting \U and similar escapes. */
static const char *hexdigits = "0123456789abcdef";
3689
3690PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003693 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003696#ifdef Py_UNICODE_WIDE
3697 const Py_ssize_t expandsize = 10;
3698#else
3699 const Py_ssize_t expandsize = 6;
3700#endif
3701
Thomas Wouters89f507f2006-12-13 04:49:30 +00003702 /* XXX(nnorwitz): rather than over-allocating, it would be
3703 better to choose a different scheme. Perhaps scan the
3704 first N-chars of the string and allocate based on that size.
3705 */
3706 /* Initial allocation is based on the longest-possible unichr
3707 escape.
3708
3709 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3710 unichr, so in this case it's the longest unichr escape. In
3711 narrow (UTF-16) builds this is five chars per source unichr
3712 since there are two unichrs in the surrogate pair, so in narrow
3713 (UTF-16) builds it's not the longest unichr escape.
3714
3715 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3716 so in the narrow (UTF-16) build case it's the longest unichr
3717 escape.
3718 */
3719
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003720 if (size == 0)
3721 return PyBytes_FromStringAndSize(NULL, 0);
3722
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003723 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003724 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003725
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003726 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003727 2
3728 + expandsize*size
3729 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 if (repr == NULL)
3731 return NULL;
3732
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003733 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 while (size-- > 0) {
3736 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003737
Walter Dörwald79e913e2007-05-12 11:08:06 +00003738 /* Escape backslashes */
3739 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 *p++ = '\\';
3741 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003742 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003743 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003744
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003745#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003746 /* Map 21-bit characters to '\U00xxxxxx' */
3747 else if (ch >= 0x10000) {
3748 *p++ = '\\';
3749 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003750 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3751 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3752 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3753 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3754 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3755 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3756 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3757 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003758 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003759 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003760#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003761 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3762 else if (ch >= 0xD800 && ch < 0xDC00) {
3763 Py_UNICODE ch2;
3764 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003765
Benjamin Peterson29060642009-01-31 22:14:21 +00003766 ch2 = *s++;
3767 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003768 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003769 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3770 *p++ = '\\';
3771 *p++ = 'U';
3772 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3773 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3774 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3775 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3776 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3777 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3778 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3779 *p++ = hexdigits[ucs & 0x0000000F];
3780 continue;
3781 }
3782 /* Fall through: isolated surrogates are copied as-is */
3783 s--;
3784 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003785 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003786#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003787
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003789 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 *p++ = '\\';
3791 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003792 *p++ = hexdigits[(ch >> 12) & 0x000F];
3793 *p++ = hexdigits[(ch >> 8) & 0x000F];
3794 *p++ = hexdigits[(ch >> 4) & 0x000F];
3795 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003797
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003798 /* Map special whitespace to '\t', \n', '\r' */
3799 else if (ch == '\t') {
3800 *p++ = '\\';
3801 *p++ = 't';
3802 }
3803 else if (ch == '\n') {
3804 *p++ = '\\';
3805 *p++ = 'n';
3806 }
3807 else if (ch == '\r') {
3808 *p++ = '\\';
3809 *p++ = 'r';
3810 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003811
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003812 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003813 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003815 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003816 *p++ = hexdigits[(ch >> 4) & 0x000F];
3817 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003818 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003819
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820 /* Copy everything else as-is */
3821 else
3822 *p++ = (char) ch;
3823 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003825 assert(p - PyBytes_AS_STRING(repr) > 0);
3826 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3827 return NULL;
3828 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829}
3830
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003831PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003833 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 if (!PyUnicode_Check(unicode)) {
3835 PyErr_BadArgument();
3836 return NULL;
3837 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003838 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3839 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003840 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841}
3842
3843/* --- Raw Unicode Escape Codec ------------------------------------------- */
3844
3845PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003846 Py_ssize_t size,
3847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003850 Py_ssize_t startinpos;
3851 Py_ssize_t endinpos;
3852 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003854 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 const char *end;
3856 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 PyObject *errorHandler = NULL;
3858 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003859
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 /* Escaped strings will always be longer than the resulting
3861 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 length after conversion to the true value. (But decoding error
3863 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 v = _PyUnicode_New(size);
3865 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003866 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003868 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003869 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 end = s + size;
3871 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003872 unsigned char c;
3873 Py_UCS4 x;
3874 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003875 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876
Benjamin Peterson29060642009-01-31 22:14:21 +00003877 /* Non-escape characters are interpreted as Unicode ordinals */
3878 if (*s != '\\') {
3879 *p++ = (unsigned char)*s++;
3880 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003881 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003882 startinpos = s-starts;
3883
3884 /* \u-escapes are only interpreted iff the number of leading
3885 backslashes if odd */
3886 bs = s;
3887 for (;s < end;) {
3888 if (*s != '\\')
3889 break;
3890 *p++ = (unsigned char)*s++;
3891 }
3892 if (((s - bs) & 1) == 0 ||
3893 s >= end ||
3894 (*s != 'u' && *s != 'U')) {
3895 continue;
3896 }
3897 p--;
3898 count = *s=='u' ? 4 : 8;
3899 s++;
3900
3901 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3902 outpos = p-PyUnicode_AS_UNICODE(v);
3903 for (x = 0, i = 0; i < count; ++i, ++s) {
3904 c = (unsigned char)*s;
3905 if (!ISXDIGIT(c)) {
3906 endinpos = s-starts;
3907 if (unicode_decode_call_errorhandler(
3908 errors, &errorHandler,
3909 "rawunicodeescape", "truncated \\uXXXX",
3910 &starts, &end, &startinpos, &endinpos, &exc, &s,
3911 &v, &outpos, &p))
3912 goto onError;
3913 goto nextByte;
3914 }
3915 x = (x<<4) & ~0xF;
3916 if (c >= '0' && c <= '9')
3917 x += c - '0';
3918 else if (c >= 'a' && c <= 'f')
3919 x += 10 + c - 'a';
3920 else
3921 x += 10 + c - 'A';
3922 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003923 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 /* UCS-2 character */
3925 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003926 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003927 /* UCS-4 character. Either store directly, or as
3928 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003929#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003931#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003932 x -= 0x10000L;
3933 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3934 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003935#endif
3936 } else {
3937 endinpos = s-starts;
3938 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003939 if (unicode_decode_call_errorhandler(
3940 errors, &errorHandler,
3941 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003942 &starts, &end, &startinpos, &endinpos, &exc, &s,
3943 &v, &outpos, &p))
3944 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003945 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003946 nextByte:
3947 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003949 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 Py_XDECREF(errorHandler);
3952 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003954
Benjamin Peterson29060642009-01-31 22:14:21 +00003955 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 Py_XDECREF(errorHandler);
3958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 return NULL;
3960}
3961
3962PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003965 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 char *p;
3967 char *q;
3968
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003969#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003970 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003971#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003972 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003973#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003974
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003975 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003977
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003978 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 if (repr == NULL)
3980 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003981 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003982 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003984 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 while (size-- > 0) {
3986 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003987#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 /* Map 32-bit characters to '\Uxxxxxxxx' */
3989 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003990 *p++ = '\\';
3991 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003992 *p++ = hexdigits[(ch >> 28) & 0xf];
3993 *p++ = hexdigits[(ch >> 24) & 0xf];
3994 *p++ = hexdigits[(ch >> 20) & 0xf];
3995 *p++ = hexdigits[(ch >> 16) & 0xf];
3996 *p++ = hexdigits[(ch >> 12) & 0xf];
3997 *p++ = hexdigits[(ch >> 8) & 0xf];
3998 *p++ = hexdigits[(ch >> 4) & 0xf];
3999 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004000 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004001 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004002#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4004 if (ch >= 0xD800 && ch < 0xDC00) {
4005 Py_UNICODE ch2;
4006 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004007
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 ch2 = *s++;
4009 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004010 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004011 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4012 *p++ = '\\';
4013 *p++ = 'U';
4014 *p++ = hexdigits[(ucs >> 28) & 0xf];
4015 *p++ = hexdigits[(ucs >> 24) & 0xf];
4016 *p++ = hexdigits[(ucs >> 20) & 0xf];
4017 *p++ = hexdigits[(ucs >> 16) & 0xf];
4018 *p++ = hexdigits[(ucs >> 12) & 0xf];
4019 *p++ = hexdigits[(ucs >> 8) & 0xf];
4020 *p++ = hexdigits[(ucs >> 4) & 0xf];
4021 *p++ = hexdigits[ucs & 0xf];
4022 continue;
4023 }
4024 /* Fall through: isolated surrogates are copied as-is */
4025 s--;
4026 size++;
4027 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004028#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004029 /* Map 16-bit characters to '\uxxxx' */
4030 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 *p++ = '\\';
4032 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004033 *p++ = hexdigits[(ch >> 12) & 0xf];
4034 *p++ = hexdigits[(ch >> 8) & 0xf];
4035 *p++ = hexdigits[(ch >> 4) & 0xf];
4036 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 /* Copy everything else as-is */
4039 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 *p++ = (char) ch;
4041 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004042 size = p - q;
4043
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004044 assert(size > 0);
4045 if (_PyBytes_Resize(&repr, size) < 0)
4046 return NULL;
4047 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048}
4049
4050PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4051{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004052 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004054 PyErr_BadArgument();
4055 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004057 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4058 PyUnicode_GET_SIZE(unicode));
4059
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004060 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061}
4062
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004063/* --- Unicode Internal Codec ------------------------------------------- */
4064
4065PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 Py_ssize_t size,
4067 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004068{
4069 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004070 Py_ssize_t startinpos;
4071 Py_ssize_t endinpos;
4072 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004073 PyUnicodeObject *v;
4074 Py_UNICODE *p;
4075 const char *end;
4076 const char *reason;
4077 PyObject *errorHandler = NULL;
4078 PyObject *exc = NULL;
4079
Neal Norwitzd43069c2006-01-08 01:12:10 +00004080#ifdef Py_UNICODE_WIDE
4081 Py_UNICODE unimax = PyUnicode_GetMax();
4082#endif
4083
Thomas Wouters89f507f2006-12-13 04:49:30 +00004084 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004085 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4086 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004087 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004088 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004089 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004090 p = PyUnicode_AS_UNICODE(v);
4091 end = s + size;
4092
4093 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004094 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004095 /* We have to sanity check the raw data, otherwise doom looms for
4096 some malformed UCS-4 data. */
4097 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004098#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004099 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004100#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004101 end-s < Py_UNICODE_SIZE
4102 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004104 startinpos = s - starts;
4105 if (end-s < Py_UNICODE_SIZE) {
4106 endinpos = end-starts;
4107 reason = "truncated input";
4108 }
4109 else {
4110 endinpos = s - starts + Py_UNICODE_SIZE;
4111 reason = "illegal code point (> 0x10FFFF)";
4112 }
4113 outpos = p - PyUnicode_AS_UNICODE(v);
4114 if (unicode_decode_call_errorhandler(
4115 errors, &errorHandler,
4116 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004117 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004118 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004119 goto onError;
4120 }
4121 }
4122 else {
4123 p++;
4124 s += Py_UNICODE_SIZE;
4125 }
4126 }
4127
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004128 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004129 goto onError;
4130 Py_XDECREF(errorHandler);
4131 Py_XDECREF(exc);
4132 return (PyObject *)v;
4133
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004135 Py_XDECREF(v);
4136 Py_XDECREF(errorHandler);
4137 Py_XDECREF(exc);
4138 return NULL;
4139}
4140
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141/* --- Latin-1 Codec ------------------------------------------------------ */
4142
4143PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 Py_ssize_t size,
4145 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146{
4147 PyUnicodeObject *v;
4148 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004149 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004150
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004152 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 Py_UNICODE r = *(unsigned char*)s;
4154 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004155 }
4156
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157 v = _PyUnicode_New(size);
4158 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004159 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004163 e = s + size;
4164 /* Unrolling the copy makes it much faster by reducing the looping
4165 overhead. This is similar to what many memcpy() implementations do. */
4166 unrolled_end = e - 4;
4167 while (s < unrolled_end) {
4168 p[0] = (unsigned char) s[0];
4169 p[1] = (unsigned char) s[1];
4170 p[2] = (unsigned char) s[2];
4171 p[3] = (unsigned char) s[3];
4172 s += 4;
4173 p += 4;
4174 }
4175 while (s < e)
4176 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004178
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 Py_XDECREF(v);
4181 return NULL;
4182}
4183
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184/* create or adjust a UnicodeEncodeError */
4185static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 const char *encoding,
4187 const Py_UNICODE *unicode, Py_ssize_t size,
4188 Py_ssize_t startpos, Py_ssize_t endpos,
4189 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 *exceptionObject = PyUnicodeEncodeError_Create(
4193 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 }
4195 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4197 goto onError;
4198 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4199 goto onError;
4200 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4201 goto onError;
4202 return;
4203 onError:
4204 Py_DECREF(*exceptionObject);
4205 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206 }
4207}
4208
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209/* raises a UnicodeEncodeError */
4210static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004211 const char *encoding,
4212 const Py_UNICODE *unicode, Py_ssize_t size,
4213 Py_ssize_t startpos, Py_ssize_t endpos,
4214 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215{
4216 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004217 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004219 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220}
4221
4222/* error handling callback helper:
4223 build arguments, call the callback and check the arguments,
4224 put the result into newpos and return the replacement string, which
4225 has to be freed by the caller */
4226static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 PyObject **errorHandler,
4228 const char *encoding, const char *reason,
4229 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4230 Py_ssize_t startpos, Py_ssize_t endpos,
4231 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004233 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234
4235 PyObject *restuple;
4236 PyObject *resunicode;
4237
4238 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004239 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004241 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 }
4243
4244 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248
4249 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004254 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 Py_DECREF(restuple);
4256 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004258 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 &resunicode, newpos)) {
4260 Py_DECREF(restuple);
4261 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004263 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4264 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4265 Py_DECREF(restuple);
4266 return NULL;
4267 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004269 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004270 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4272 Py_DECREF(restuple);
4273 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004274 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 Py_INCREF(resunicode);
4276 Py_DECREF(restuple);
4277 return resunicode;
4278}
4279
4280static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 Py_ssize_t size,
4282 const char *errors,
4283 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284{
4285 /* output object */
4286 PyObject *res;
4287 /* pointers to the beginning and end+1 of input */
4288 const Py_UNICODE *startp = p;
4289 const Py_UNICODE *endp = p + size;
4290 /* pointer to the beginning of the unencodable characters */
4291 /* const Py_UNICODE *badp = NULL; */
4292 /* pointer into the output */
4293 char *str;
4294 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004295 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004296 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4297 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 PyObject *errorHandler = NULL;
4299 PyObject *exc = NULL;
4300 /* the following variable is used for caching string comparisons
4301 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4302 int known_errorHandler = -1;
4303
4304 /* allocate enough for a simple encoding without
4305 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004306 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004307 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004308 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004309 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004310 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004311 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 ressize = size;
4313
4314 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316
Benjamin Peterson29060642009-01-31 22:14:21 +00004317 /* can we encode this? */
4318 if (c<limit) {
4319 /* no overflow check, because we know that the space is enough */
4320 *str++ = (char)c;
4321 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004322 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 else {
4324 Py_ssize_t unicodepos = p-startp;
4325 Py_ssize_t requiredsize;
4326 PyObject *repunicode;
4327 Py_ssize_t repsize;
4328 Py_ssize_t newpos;
4329 Py_ssize_t respos;
4330 Py_UNICODE *uni2;
4331 /* startpos for collecting unencodable chars */
4332 const Py_UNICODE *collstart = p;
4333 const Py_UNICODE *collend = p;
4334 /* find all unecodable characters */
4335 while ((collend < endp) && ((*collend)>=limit))
4336 ++collend;
4337 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4338 if (known_errorHandler==-1) {
4339 if ((errors==NULL) || (!strcmp(errors, "strict")))
4340 known_errorHandler = 1;
4341 else if (!strcmp(errors, "replace"))
4342 known_errorHandler = 2;
4343 else if (!strcmp(errors, "ignore"))
4344 known_errorHandler = 3;
4345 else if (!strcmp(errors, "xmlcharrefreplace"))
4346 known_errorHandler = 4;
4347 else
4348 known_errorHandler = 0;
4349 }
4350 switch (known_errorHandler) {
4351 case 1: /* strict */
4352 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4353 goto onError;
4354 case 2: /* replace */
4355 while (collstart++<collend)
4356 *str++ = '?'; /* fall through */
4357 case 3: /* ignore */
4358 p = collend;
4359 break;
4360 case 4: /* xmlcharrefreplace */
4361 respos = str - PyBytes_AS_STRING(res);
4362 /* determine replacement size (temporarily (mis)uses p) */
4363 for (p = collstart, repsize = 0; p < collend; ++p) {
4364 if (*p<10)
4365 repsize += 2+1+1;
4366 else if (*p<100)
4367 repsize += 2+2+1;
4368 else if (*p<1000)
4369 repsize += 2+3+1;
4370 else if (*p<10000)
4371 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004372#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 else
4374 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004375#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 else if (*p<100000)
4377 repsize += 2+5+1;
4378 else if (*p<1000000)
4379 repsize += 2+6+1;
4380 else
4381 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004382#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 }
4384 requiredsize = respos+repsize+(endp-collend);
4385 if (requiredsize > ressize) {
4386 if (requiredsize<2*ressize)
4387 requiredsize = 2*ressize;
4388 if (_PyBytes_Resize(&res, requiredsize))
4389 goto onError;
4390 str = PyBytes_AS_STRING(res) + respos;
4391 ressize = requiredsize;
4392 }
4393 /* generate replacement (temporarily (mis)uses p) */
4394 for (p = collstart; p < collend; ++p) {
4395 str += sprintf(str, "&#%d;", (int)*p);
4396 }
4397 p = collend;
4398 break;
4399 default:
4400 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4401 encoding, reason, startp, size, &exc,
4402 collstart-startp, collend-startp, &newpos);
4403 if (repunicode == NULL)
4404 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004405 if (PyBytes_Check(repunicode)) {
4406 /* Directly copy bytes result to output. */
4407 repsize = PyBytes_Size(repunicode);
4408 if (repsize > 1) {
4409 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004410 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004411 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4412 Py_DECREF(repunicode);
4413 goto onError;
4414 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004415 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004416 ressize += repsize-1;
4417 }
4418 memcpy(str, PyBytes_AsString(repunicode), repsize);
4419 str += repsize;
4420 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004421 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004422 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004423 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004424 /* need more space? (at least enough for what we
4425 have+the replacement+the rest of the string, so
4426 we won't have to check space for encodable characters) */
4427 respos = str - PyBytes_AS_STRING(res);
4428 repsize = PyUnicode_GET_SIZE(repunicode);
4429 requiredsize = respos+repsize+(endp-collend);
4430 if (requiredsize > ressize) {
4431 if (requiredsize<2*ressize)
4432 requiredsize = 2*ressize;
4433 if (_PyBytes_Resize(&res, requiredsize)) {
4434 Py_DECREF(repunicode);
4435 goto onError;
4436 }
4437 str = PyBytes_AS_STRING(res) + respos;
4438 ressize = requiredsize;
4439 }
4440 /* check if there is anything unencodable in the replacement
4441 and copy it to the output */
4442 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4443 c = *uni2;
4444 if (c >= limit) {
4445 raise_encode_exception(&exc, encoding, startp, size,
4446 unicodepos, unicodepos+1, reason);
4447 Py_DECREF(repunicode);
4448 goto onError;
4449 }
4450 *str = (char)c;
4451 }
4452 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004453 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004455 }
4456 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004457 /* Resize if we allocated to much */
4458 size = str - PyBytes_AS_STRING(res);
4459 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004460 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004461 if (_PyBytes_Resize(&res, size) < 0)
4462 goto onError;
4463 }
4464
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 Py_XDECREF(errorHandler);
4466 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004467 return res;
4468
4469 onError:
4470 Py_XDECREF(res);
4471 Py_XDECREF(errorHandler);
4472 Py_XDECREF(exc);
4473 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474}
4475
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 Py_ssize_t size,
4478 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481}
4482
4483PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4484{
4485 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 PyErr_BadArgument();
4487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 }
4489 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 PyUnicode_GET_SIZE(unicode),
4491 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492}
4493
4494/* --- 7-bit ASCII Codec -------------------------------------------------- */
4495
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 Py_ssize_t size,
4498 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501 PyUnicodeObject *v;
4502 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 Py_ssize_t startinpos;
4504 Py_ssize_t endinpos;
4505 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 const char *e;
4507 PyObject *errorHandler = NULL;
4508 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004509
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004511 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 Py_UNICODE r = *(unsigned char*)s;
4513 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004514 }
Tim Petersced69f82003-09-16 20:30:58 +00004515
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 v = _PyUnicode_New(size);
4517 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 e = s + size;
4523 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 register unsigned char c = (unsigned char)*s;
4525 if (c < 128) {
4526 *p++ = c;
4527 ++s;
4528 }
4529 else {
4530 startinpos = s-starts;
4531 endinpos = startinpos + 1;
4532 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4533 if (unicode_decode_call_errorhandler(
4534 errors, &errorHandler,
4535 "ascii", "ordinal not in range(128)",
4536 &starts, &e, &startinpos, &endinpos, &exc, &s,
4537 &v, &outpos, &p))
4538 goto onError;
4539 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004541 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4543 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 Py_XDECREF(errorHandler);
4545 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004547
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 Py_XDECREF(errorHandler);
4551 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552 return NULL;
4553}
4554
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 Py_ssize_t size,
4557 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560}
4561
4562PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4563{
4564 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004565 PyErr_BadArgument();
4566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 }
4568 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 PyUnicode_GET_SIZE(unicode),
4570 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571}
4572
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004573#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004574
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004575/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004576
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004577#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004578#define NEED_RETRY
4579#endif
4580
4581/* XXX This code is limited to "true" double-byte encodings, as
4582 a) it assumes an incomplete character consists of a single byte, and
4583 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004585
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   IsDBCSLeadByte() on the byte alone is not enough, so the character
   preceding it (found with CharPrev()) is inspected as well.  The byte
   is accepted when CharPrev() could not step back, when the previous
   character does not start with a lead byte, or when the previous
   character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
4596
4597/*
4598 * Decode MBCS string into unicode object. If 'final' is set, converts
4599 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4600 */
4601static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 const char *s, /* MBCS string */
4603 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004604 int final,
4605 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004606{
4607 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004608 Py_ssize_t n;
4609 DWORD usize;
4610 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004611
4612 assert(size >= 0);
4613
Victor Stinner554f3f02010-06-16 23:33:54 +00004614 /* check and handle 'errors' arg */
4615 if (errors==NULL || strcmp(errors, "strict")==0)
4616 flags = MB_ERR_INVALID_CHARS;
4617 else if (strcmp(errors, "ignore")==0)
4618 flags = 0;
4619 else {
4620 PyErr_Format(PyExc_ValueError,
4621 "mbcs encoding does not support errors='%s'",
4622 errors);
4623 return -1;
4624 }
4625
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004626 /* Skip trailing lead-byte unless 'final' is set */
4627 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004628 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004629
4630 /* First get the size of the result */
4631 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004632 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4633 if (usize==0)
4634 goto mbcs_decode_error;
4635 } else
4636 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004637
4638 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004639 /* Create unicode object */
4640 *v = _PyUnicode_New(usize);
4641 if (*v == NULL)
4642 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004643 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004644 }
4645 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 /* Extend unicode object */
4647 n = PyUnicode_GET_SIZE(*v);
4648 if (_PyUnicode_Resize(v, n + usize) < 0)
4649 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004650 }
4651
4652 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004653 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004655 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4656 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004658 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004659 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004660
4661mbcs_decode_error:
4662 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4663 we raise a UnicodeDecodeError - else it is a 'generic'
4664 windows error
4665 */
4666 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4667 /* Ideally, we should get reason from FormatMessage - this
4668 is the Windows 2000 English version of the message
4669 */
4670 PyObject *exc = NULL;
4671 const char *reason = "No mapping for the Unicode character exists "
4672 "in the target multi-byte code page.";
4673 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4674 if (exc != NULL) {
4675 PyCodec_StrictErrors(exc);
4676 Py_DECREF(exc);
4677 }
4678 } else {
4679 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4680 }
4681 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004682}
4683
4684PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 Py_ssize_t size,
4686 const char *errors,
4687 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004688{
4689 PyUnicodeObject *v = NULL;
4690 int done;
4691
4692 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004694
4695#ifdef NEED_RETRY
4696 retry:
4697 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004698 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004699 else
4700#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004701 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004702
4703 if (done < 0) {
4704 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004705 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004706 }
4707
4708 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004710
4711#ifdef NEED_RETRY
4712 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004713 s += done;
4714 size -= done;
4715 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004716 }
4717#endif
4718
4719 return (PyObject *)v;
4720}
4721
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004722PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 Py_ssize_t size,
4724 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004725{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004726 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4727}
4728
4729/*
4730 * Convert unicode into string object (MBCS).
4731 * Returns 0 if succeed, -1 otherwise.
4732 */
4733static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004735 int size, /* size of unicode */
4736 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004737{
Victor Stinner554f3f02010-06-16 23:33:54 +00004738 BOOL usedDefaultChar = FALSE;
4739 BOOL *pusedDefaultChar;
4740 int mbcssize;
4741 Py_ssize_t n;
4742 PyObject *exc = NULL;
4743 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004744
4745 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004746
Victor Stinner554f3f02010-06-16 23:33:54 +00004747 /* check and handle 'errors' arg */
4748 if (errors==NULL || strcmp(errors, "strict")==0) {
4749 flags = WC_NO_BEST_FIT_CHARS;
4750 pusedDefaultChar = &usedDefaultChar;
4751 } else if (strcmp(errors, "replace")==0) {
4752 flags = 0;
4753 pusedDefaultChar = NULL;
4754 } else {
4755 PyErr_Format(PyExc_ValueError,
4756 "mbcs encoding does not support errors='%s'",
4757 errors);
4758 return -1;
4759 }
4760
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004761 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004762 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004763 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4764 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 if (mbcssize == 0) {
4766 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4767 return -1;
4768 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004769 /* If we used a default char, then we failed! */
4770 if (pusedDefaultChar && *pusedDefaultChar)
4771 goto mbcs_encode_error;
4772 } else {
4773 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004774 }
4775
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004776 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 /* Create string object */
4778 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4779 if (*repr == NULL)
4780 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004781 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004782 }
4783 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 /* Extend string object */
4785 n = PyBytes_Size(*repr);
4786 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4787 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004788 }
4789
4790 /* Do the conversion */
4791 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004793 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4794 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004795 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4796 return -1;
4797 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004798 if (pusedDefaultChar && *pusedDefaultChar)
4799 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004800 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004801 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004802
4803mbcs_encode_error:
4804 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4805 Py_XDECREF(exc);
4806 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004807}
4808
4809PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 Py_ssize_t size,
4811 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004812{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004813 PyObject *repr = NULL;
4814 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004815
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004816#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004818 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004819 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004820 else
4821#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004822 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004823
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004824 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 Py_XDECREF(repr);
4826 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004827 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004828
4829#ifdef NEED_RETRY
4830 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 p += INT_MAX;
4832 size -= INT_MAX;
4833 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004834 }
4835#endif
4836
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004837 return repr;
4838}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004839
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004840PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4841{
4842 if (!PyUnicode_Check(unicode)) {
4843 PyErr_BadArgument();
4844 return NULL;
4845 }
4846 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 PyUnicode_GET_SIZE(unicode),
4848 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004849}
4850
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004851#undef NEED_RETRY
4852
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004853#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004854
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855/* --- Character Mapping Codec -------------------------------------------- */
4856
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 Py_ssize_t size,
4859 PyObject *mapping,
4860 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004862 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004863 Py_ssize_t startinpos;
4864 Py_ssize_t endinpos;
4865 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 PyUnicodeObject *v;
4868 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004869 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 PyObject *errorHandler = NULL;
4871 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004872 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004874
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 /* Default to Latin-1 */
4876 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878
4879 v = _PyUnicode_New(size);
4880 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004883 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004886 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 mapstring = PyUnicode_AS_UNICODE(mapping);
4888 maplen = PyUnicode_GET_SIZE(mapping);
4889 while (s < e) {
4890 unsigned char ch = *s;
4891 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 if (ch < maplen)
4894 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 if (x == 0xfffe) {
4897 /* undefined mapping */
4898 outpos = p-PyUnicode_AS_UNICODE(v);
4899 startinpos = s-starts;
4900 endinpos = startinpos+1;
4901 if (unicode_decode_call_errorhandler(
4902 errors, &errorHandler,
4903 "charmap", "character maps to <undefined>",
4904 &starts, &e, &startinpos, &endinpos, &exc, &s,
4905 &v, &outpos, &p)) {
4906 goto onError;
4907 }
4908 continue;
4909 }
4910 *p++ = x;
4911 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004912 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004913 }
4914 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004915 while (s < e) {
4916 unsigned char ch = *s;
4917 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004918
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4920 w = PyLong_FromLong((long)ch);
4921 if (w == NULL)
4922 goto onError;
4923 x = PyObject_GetItem(mapping, w);
4924 Py_DECREF(w);
4925 if (x == NULL) {
4926 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4927 /* No mapping found means: mapping is undefined. */
4928 PyErr_Clear();
4929 x = Py_None;
4930 Py_INCREF(x);
4931 } else
4932 goto onError;
4933 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004934
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 /* Apply mapping */
4936 if (PyLong_Check(x)) {
4937 long value = PyLong_AS_LONG(x);
4938 if (value < 0 || value > 65535) {
4939 PyErr_SetString(PyExc_TypeError,
4940 "character mapping must be in range(65536)");
4941 Py_DECREF(x);
4942 goto onError;
4943 }
4944 *p++ = (Py_UNICODE)value;
4945 }
4946 else if (x == Py_None) {
4947 /* undefined mapping */
4948 outpos = p-PyUnicode_AS_UNICODE(v);
4949 startinpos = s-starts;
4950 endinpos = startinpos+1;
4951 if (unicode_decode_call_errorhandler(
4952 errors, &errorHandler,
4953 "charmap", "character maps to <undefined>",
4954 &starts, &e, &startinpos, &endinpos, &exc, &s,
4955 &v, &outpos, &p)) {
4956 Py_DECREF(x);
4957 goto onError;
4958 }
4959 Py_DECREF(x);
4960 continue;
4961 }
4962 else if (PyUnicode_Check(x)) {
4963 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004964
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 if (targetsize == 1)
4966 /* 1-1 mapping */
4967 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004968
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 else if (targetsize > 1) {
4970 /* 1-n mapping */
4971 if (targetsize > extrachars) {
4972 /* resize first */
4973 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4974 Py_ssize_t needed = (targetsize - extrachars) + \
4975 (targetsize << 2);
4976 extrachars += needed;
4977 /* XXX overflow detection missing */
4978 if (_PyUnicode_Resize(&v,
4979 PyUnicode_GET_SIZE(v) + needed) < 0) {
4980 Py_DECREF(x);
4981 goto onError;
4982 }
4983 p = PyUnicode_AS_UNICODE(v) + oldpos;
4984 }
4985 Py_UNICODE_COPY(p,
4986 PyUnicode_AS_UNICODE(x),
4987 targetsize);
4988 p += targetsize;
4989 extrachars -= targetsize;
4990 }
4991 /* 1-0 mapping: skip the character */
4992 }
4993 else {
4994 /* wrong return value */
4995 PyErr_SetString(PyExc_TypeError,
4996 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004997 Py_DECREF(x);
4998 goto onError;
4999 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005000 Py_DECREF(x);
5001 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003 }
5004 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5006 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 Py_XDECREF(errorHandler);
5008 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005010
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005012 Py_XDECREF(errorHandler);
5013 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014 Py_XDECREF(v);
5015 return NULL;
5016}
5017
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005018/* Charmap encoding: the lookup table */
5019
5020struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 PyObject_HEAD
5022 unsigned char level1[32];
5023 int count2, count3;
5024 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005025};
5026
5027static PyObject*
5028encoding_map_size(PyObject *obj, PyObject* args)
5029{
5030 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005031 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005033}
5034
5035static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005036 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 PyDoc_STR("Return the size (in bytes) of this object") },
5038 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005039};
5040
5041static void
5042encoding_map_dealloc(PyObject* o)
5043{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005044 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005045}
5046
5047static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005048 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 "EncodingMap", /*tp_name*/
5050 sizeof(struct encoding_map), /*tp_basicsize*/
5051 0, /*tp_itemsize*/
5052 /* methods */
5053 encoding_map_dealloc, /*tp_dealloc*/
5054 0, /*tp_print*/
5055 0, /*tp_getattr*/
5056 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005057 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 0, /*tp_repr*/
5059 0, /*tp_as_number*/
5060 0, /*tp_as_sequence*/
5061 0, /*tp_as_mapping*/
5062 0, /*tp_hash*/
5063 0, /*tp_call*/
5064 0, /*tp_str*/
5065 0, /*tp_getattro*/
5066 0, /*tp_setattro*/
5067 0, /*tp_as_buffer*/
5068 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5069 0, /*tp_doc*/
5070 0, /*tp_traverse*/
5071 0, /*tp_clear*/
5072 0, /*tp_richcompare*/
5073 0, /*tp_weaklistoffset*/
5074 0, /*tp_iter*/
5075 0, /*tp_iternext*/
5076 encoding_map_methods, /*tp_methods*/
5077 0, /*tp_members*/
5078 0, /*tp_getset*/
5079 0, /*tp_base*/
5080 0, /*tp_dict*/
5081 0, /*tp_descr_get*/
5082 0, /*tp_descr_set*/
5083 0, /*tp_dictoffset*/
5084 0, /*tp_init*/
5085 0, /*tp_alloc*/
5086 0, /*tp_new*/
5087 0, /*tp_free*/
5088 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005089};
5090
5091PyObject*
5092PyUnicode_BuildEncodingMap(PyObject* string)
5093{
5094 Py_UNICODE *decode;
5095 PyObject *result;
5096 struct encoding_map *mresult;
5097 int i;
5098 int need_dict = 0;
5099 unsigned char level1[32];
5100 unsigned char level2[512];
5101 unsigned char *mlevel1, *mlevel2, *mlevel3;
5102 int count2 = 0, count3 = 0;
5103
5104 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5105 PyErr_BadArgument();
5106 return NULL;
5107 }
5108 decode = PyUnicode_AS_UNICODE(string);
5109 memset(level1, 0xFF, sizeof level1);
5110 memset(level2, 0xFF, sizeof level2);
5111
5112 /* If there isn't a one-to-one mapping of NULL to \0,
5113 or if there are non-BMP characters, we need to use
5114 a mapping dictionary. */
5115 if (decode[0] != 0)
5116 need_dict = 1;
5117 for (i = 1; i < 256; i++) {
5118 int l1, l2;
5119 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005120#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005121 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005122#endif
5123 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005124 need_dict = 1;
5125 break;
5126 }
5127 if (decode[i] == 0xFFFE)
5128 /* unmapped character */
5129 continue;
5130 l1 = decode[i] >> 11;
5131 l2 = decode[i] >> 7;
5132 if (level1[l1] == 0xFF)
5133 level1[l1] = count2++;
5134 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005135 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005136 }
5137
5138 if (count2 >= 0xFF || count3 >= 0xFF)
5139 need_dict = 1;
5140
5141 if (need_dict) {
5142 PyObject *result = PyDict_New();
5143 PyObject *key, *value;
5144 if (!result)
5145 return NULL;
5146 for (i = 0; i < 256; i++) {
5147 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005148 key = PyLong_FromLong(decode[i]);
5149 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005150 if (!key || !value)
5151 goto failed1;
5152 if (PyDict_SetItem(result, key, value) == -1)
5153 goto failed1;
5154 Py_DECREF(key);
5155 Py_DECREF(value);
5156 }
5157 return result;
5158 failed1:
5159 Py_XDECREF(key);
5160 Py_XDECREF(value);
5161 Py_DECREF(result);
5162 return NULL;
5163 }
5164
5165 /* Create a three-level trie */
5166 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5167 16*count2 + 128*count3 - 1);
5168 if (!result)
5169 return PyErr_NoMemory();
5170 PyObject_Init(result, &EncodingMapType);
5171 mresult = (struct encoding_map*)result;
5172 mresult->count2 = count2;
5173 mresult->count3 = count3;
5174 mlevel1 = mresult->level1;
5175 mlevel2 = mresult->level23;
5176 mlevel3 = mresult->level23 + 16*count2;
5177 memcpy(mlevel1, level1, 32);
5178 memset(mlevel2, 0xFF, 16*count2);
5179 memset(mlevel3, 0, 128*count3);
5180 count3 = 0;
5181 for (i = 1; i < 256; i++) {
5182 int o1, o2, o3, i2, i3;
5183 if (decode[i] == 0xFFFE)
5184 /* unmapped character */
5185 continue;
5186 o1 = decode[i]>>11;
5187 o2 = (decode[i]>>7) & 0xF;
5188 i2 = 16*mlevel1[o1] + o2;
5189 if (mlevel2[i2] == 0xFF)
5190 mlevel2[i2] = count3++;
5191 o3 = decode[i] & 0x7F;
5192 i3 = 128*mlevel2[i2] + o3;
5193 mlevel3[i3] = i;
5194 }
5195 return result;
5196}
5197
5198static int
5199encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5200{
5201 struct encoding_map *map = (struct encoding_map*)mapping;
5202 int l1 = c>>11;
5203 int l2 = (c>>7) & 0xF;
5204 int l3 = c & 0x7F;
5205 int i;
5206
5207#ifdef Py_UNICODE_WIDE
5208 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005210 }
5211#endif
5212 if (c == 0)
5213 return 0;
5214 /* level 1*/
5215 i = map->level1[l1];
5216 if (i == 0xFF) {
5217 return -1;
5218 }
5219 /* level 2*/
5220 i = map->level23[16*i+l2];
5221 if (i == 0xFF) {
5222 return -1;
5223 }
5224 /* level 3 */
5225 i = map->level23[16*map->count2 + 128*i + l3];
5226 if (i == 0) {
5227 return -1;
5228 }
5229 return i;
5230}
5231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005232/* Lookup the character ch in the mapping. If the character
5233 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005234 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236{
Christian Heimes217cfd12007-12-02 14:31:20 +00005237 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 PyObject *x;
5239
5240 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 x = PyObject_GetItem(mapping, w);
5243 Py_DECREF(w);
5244 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5246 /* No mapping found means: mapping is undefined. */
5247 PyErr_Clear();
5248 x = Py_None;
5249 Py_INCREF(x);
5250 return x;
5251 } else
5252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005254 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005256 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 long value = PyLong_AS_LONG(x);
5258 if (value < 0 || value > 255) {
5259 PyErr_SetString(PyExc_TypeError,
5260 "character mapping must be in range(256)");
5261 Py_DECREF(x);
5262 return NULL;
5263 }
5264 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005266 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 /* wrong return value */
5270 PyErr_Format(PyExc_TypeError,
5271 "character mapping must return integer, bytes or None, not %.400s",
5272 x->ob_type->tp_name);
5273 Py_DECREF(x);
5274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 }
5276}
5277
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005278static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005279charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005280{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005281 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5282 /* exponentially overallocate to minimize reallocations */
5283 if (requiredsize < 2*outsize)
5284 requiredsize = 2*outsize;
5285 if (_PyBytes_Resize(outobj, requiredsize))
5286 return -1;
5287 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005288}
5289
/* Outcome of trying to encode one character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred during lookup or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005293/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005294 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005295 space is available. Return a new reference to the object that
5296 was put in the output buffer, or Py_None, if the mapping was undefined
5297 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005298 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005299static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005300charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005302{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005303 PyObject *rep;
5304 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005305 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005306
Christian Heimes90aa7642007-12-19 02:45:37 +00005307 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005308 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005310 if (res == -1)
5311 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 if (outsize<requiredsize)
5313 if (charmapencode_resize(outobj, outpos, requiredsize))
5314 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005315 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 outstart[(*outpos)++] = (char)res;
5317 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005318 }
5319
5320 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005321 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005323 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 Py_DECREF(rep);
5325 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005326 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 if (PyLong_Check(rep)) {
5328 Py_ssize_t requiredsize = *outpos+1;
5329 if (outsize<requiredsize)
5330 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5331 Py_DECREF(rep);
5332 return enc_EXCEPTION;
5333 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005334 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005336 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 else {
5338 const char *repchars = PyBytes_AS_STRING(rep);
5339 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5340 Py_ssize_t requiredsize = *outpos+repsize;
5341 if (outsize<requiredsize)
5342 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5343 Py_DECREF(rep);
5344 return enc_EXCEPTION;
5345 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005346 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 memcpy(outstart + *outpos, repchars, repsize);
5348 *outpos += repsize;
5349 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005350 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005351 Py_DECREF(rep);
5352 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353}
5354
5355/* handle an error in PyUnicode_EncodeCharmap
5356 Return 0 on success, -1 on error */
5357static
5358int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005359 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005361 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005362 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363{
5364 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005365 Py_ssize_t repsize;
5366 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 Py_UNICODE *uni2;
5368 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t collstartpos = *inpos;
5370 Py_ssize_t collendpos = *inpos+1;
5371 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 char *encoding = "charmap";
5373 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005374 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005375
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376 /* find all unencodable characters */
5377 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005378 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005379 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 int res = encoding_map_lookup(p[collendpos], mapping);
5381 if (res != -1)
5382 break;
5383 ++collendpos;
5384 continue;
5385 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005386
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 rep = charmapencode_lookup(p[collendpos], mapping);
5388 if (rep==NULL)
5389 return -1;
5390 else if (rep!=Py_None) {
5391 Py_DECREF(rep);
5392 break;
5393 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005394 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005396 }
5397 /* cache callback name lookup
5398 * (if not done yet, i.e. it's the first error) */
5399 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 if ((errors==NULL) || (!strcmp(errors, "strict")))
5401 *known_errorHandler = 1;
5402 else if (!strcmp(errors, "replace"))
5403 *known_errorHandler = 2;
5404 else if (!strcmp(errors, "ignore"))
5405 *known_errorHandler = 3;
5406 else if (!strcmp(errors, "xmlcharrefreplace"))
5407 *known_errorHandler = 4;
5408 else
5409 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005410 }
5411 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005412 case 1: /* strict */
5413 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5414 return -1;
5415 case 2: /* replace */
5416 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 x = charmapencode_output('?', mapping, res, respos);
5418 if (x==enc_EXCEPTION) {
5419 return -1;
5420 }
5421 else if (x==enc_FAILED) {
5422 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5423 return -1;
5424 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005425 }
5426 /* fall through */
5427 case 3: /* ignore */
5428 *inpos = collendpos;
5429 break;
5430 case 4: /* xmlcharrefreplace */
5431 /* generate replacement (temporarily (mis)uses p) */
5432 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 char buffer[2+29+1+1];
5434 char *cp;
5435 sprintf(buffer, "&#%d;", (int)p[collpos]);
5436 for (cp = buffer; *cp; ++cp) {
5437 x = charmapencode_output(*cp, mapping, res, respos);
5438 if (x==enc_EXCEPTION)
5439 return -1;
5440 else if (x==enc_FAILED) {
5441 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5442 return -1;
5443 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005444 }
5445 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005446 *inpos = collendpos;
5447 break;
5448 default:
5449 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005450 encoding, reason, p, size, exceptionObject,
5451 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005452 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005454 if (PyBytes_Check(repunicode)) {
5455 /* Directly copy bytes result to output. */
5456 Py_ssize_t outsize = PyBytes_Size(*res);
5457 Py_ssize_t requiredsize;
5458 repsize = PyBytes_Size(repunicode);
5459 requiredsize = *respos + repsize;
5460 if (requiredsize > outsize)
5461 /* Make room for all additional bytes. */
5462 if (charmapencode_resize(res, respos, requiredsize)) {
5463 Py_DECREF(repunicode);
5464 return -1;
5465 }
5466 memcpy(PyBytes_AsString(*res) + *respos,
5467 PyBytes_AsString(repunicode), repsize);
5468 *respos += repsize;
5469 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005470 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005471 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005473 /* generate replacement */
5474 repsize = PyUnicode_GET_SIZE(repunicode);
5475 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 x = charmapencode_output(*uni2, mapping, res, respos);
5477 if (x==enc_EXCEPTION) {
5478 return -1;
5479 }
5480 else if (x==enc_FAILED) {
5481 Py_DECREF(repunicode);
5482 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5483 return -1;
5484 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005485 }
5486 *inpos = newpos;
5487 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 }
5489 return 0;
5490}
5491
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 Py_ssize_t size,
5494 PyObject *mapping,
5495 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005497 /* output object */
5498 PyObject *res = NULL;
5499 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005500 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005502 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503 PyObject *errorHandler = NULL;
5504 PyObject *exc = NULL;
5505 /* the following variable is used for caching string comparisons
5506 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5507 * 3=ignore, 4=xmlcharrefreplace */
5508 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509
5510 /* Default to Latin-1 */
5511 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005514 /* allocate enough for a simple encoding without
5515 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005516 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 if (res == NULL)
5518 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005519 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005522 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 /* try to encode it */
5524 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5525 if (x==enc_EXCEPTION) /* error */
5526 goto onError;
5527 if (x==enc_FAILED) { /* unencodable character */
5528 if (charmap_encoding_error(p, size, &inpos, mapping,
5529 &exc,
5530 &known_errorHandler, &errorHandler, errors,
5531 &res, &respos)) {
5532 goto onError;
5533 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005534 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005535 else
5536 /* done with this character => adjust input position */
5537 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005540 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005541 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005542 if (_PyBytes_Resize(&res, respos) < 0)
5543 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005544
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005545 Py_XDECREF(exc);
5546 Py_XDECREF(errorHandler);
5547 return res;
5548
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 Py_XDECREF(res);
5551 Py_XDECREF(exc);
5552 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 return NULL;
5554}
5555
5556PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558{
5559 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 PyErr_BadArgument();
5561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 }
5563 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 PyUnicode_GET_SIZE(unicode),
5565 mapping,
5566 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567}
5568
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569/* create or adjust a UnicodeTranslateError */
5570static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 const Py_UNICODE *unicode, Py_ssize_t size,
5572 Py_ssize_t startpos, Py_ssize_t endpos,
5573 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005575 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005576 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 }
5579 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5581 goto onError;
5582 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5583 goto onError;
5584 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5585 goto onError;
5586 return;
5587 onError:
5588 Py_DECREF(*exceptionObject);
5589 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 }
5591}
5592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005593/* raises a UnicodeTranslateError */
5594static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 const Py_UNICODE *unicode, Py_ssize_t size,
5596 Py_ssize_t startpos, Py_ssize_t endpos,
5597 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598{
5599 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005601 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005602 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603}
5604
5605/* error handling callback helper:
5606 build arguments, call the callback and check the arguments,
5607 put the result into newpos and return the replacement string, which
5608 has to be freed by the caller */
5609static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 PyObject **errorHandler,
5611 const char *reason,
5612 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5613 Py_ssize_t startpos, Py_ssize_t endpos,
5614 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005615{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005616 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005618 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 PyObject *restuple;
5620 PyObject *resunicode;
5621
5622 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005626 }
5627
5628 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632
5633 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005635 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005638 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 Py_DECREF(restuple);
5640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 }
5642 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005643 &resunicode, &i_newpos)) {
5644 Py_DECREF(restuple);
5645 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005647 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005649 else
5650 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005651 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5653 Py_DECREF(restuple);
5654 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005655 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 Py_INCREF(resunicode);
5657 Py_DECREF(restuple);
5658 return resunicode;
5659}
5660
5661/* Lookup the character ch in the mapping and put the result in result,
5662 which must be decrefed by the caller.
5663 Return 0 on success, -1 on error */
5664static
5665int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5666{
Christian Heimes217cfd12007-12-02 14:31:20 +00005667 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005668 PyObject *x;
5669
5670 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672 x = PyObject_GetItem(mapping, w);
5673 Py_DECREF(w);
5674 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5676 /* No mapping found means: use 1:1 mapping. */
5677 PyErr_Clear();
5678 *result = NULL;
5679 return 0;
5680 } else
5681 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005682 }
5683 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 *result = x;
5685 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005687 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 long value = PyLong_AS_LONG(x);
5689 long max = PyUnicode_GetMax();
5690 if (value < 0 || value > max) {
5691 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005692 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 Py_DECREF(x);
5694 return -1;
5695 }
5696 *result = x;
5697 return 0;
5698 }
5699 else if (PyUnicode_Check(x)) {
5700 *result = x;
5701 return 0;
5702 }
5703 else {
5704 /* wrong return value */
5705 PyErr_SetString(PyExc_TypeError,
5706 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005707 Py_DECREF(x);
5708 return -1;
5709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710}
5711/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 if not reallocate and adjust various state variables.
5713 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005714static
Walter Dörwald4894c302003-10-24 14:25:28 +00005715int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005717{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005718 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005719 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 /* remember old output position */
5721 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5722 /* exponentially overallocate to minimize reallocations */
5723 if (requiredsize < 2 * oldsize)
5724 requiredsize = 2 * oldsize;
5725 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5726 return -1;
5727 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728 }
5729 return 0;
5730}
5731/* lookup the character, put the result in the output string and adjust
5732 various state variables. Return a new reference to the object that
5733 was put in the output buffer in *result, or Py_None, if the mapping was
5734 undefined (in which case no character was written).
5735 The called must decref result.
5736 Return 0 on success, -1 on error. */
5737static
Walter Dörwald4894c302003-10-24 14:25:28 +00005738int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5740 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741{
Walter Dörwald4894c302003-10-24 14:25:28 +00005742 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 /* not found => default to 1:1 mapping */
5746 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005747 }
5748 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005750 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 /* no overflow check, because we know that the space is enough */
5752 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 }
5754 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5756 if (repsize==1) {
5757 /* no overflow check, because we know that the space is enough */
5758 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5759 }
5760 else if (repsize!=0) {
5761 /* more than one character */
5762 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5763 (insize - (curinp-startinp)) +
5764 repsize - 1;
5765 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5766 return -1;
5767 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5768 *outp += repsize;
5769 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 }
5771 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 return 0;
5774}
5775
5776PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 Py_ssize_t size,
5778 PyObject *mapping,
5779 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 /* output object */
5782 PyObject *res = NULL;
5783 /* pointers to the beginning and end+1 of input */
5784 const Py_UNICODE *startp = p;
5785 const Py_UNICODE *endp = p + size;
5786 /* pointer into the output */
5787 Py_UNICODE *str;
5788 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005789 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 char *reason = "character maps to <undefined>";
5791 PyObject *errorHandler = NULL;
5792 PyObject *exc = NULL;
5793 /* the following variable is used for caching string comparisons
5794 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5795 * 3=ignore, 4=xmlcharrefreplace */
5796 int known_errorHandler = -1;
5797
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 PyErr_BadArgument();
5800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802
5803 /* allocate enough for a simple 1:1 translation without
5804 replacements, if we need more, we'll resize */
5805 res = PyUnicode_FromUnicode(NULL, size);
5806 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005812 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 /* try to encode it */
5814 PyObject *x = NULL;
5815 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5816 Py_XDECREF(x);
5817 goto onError;
5818 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005819 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 if (x!=Py_None) /* it worked => adjust input pointer */
5821 ++p;
5822 else { /* untranslatable character */
5823 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5824 Py_ssize_t repsize;
5825 Py_ssize_t newpos;
5826 Py_UNICODE *uni2;
5827 /* startpos for collecting untranslatable chars */
5828 const Py_UNICODE *collstart = p;
5829 const Py_UNICODE *collend = p+1;
5830 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* find all untranslatable characters */
5833 while (collend < endp) {
5834 if (charmaptranslate_lookup(*collend, mapping, &x))
5835 goto onError;
5836 Py_XDECREF(x);
5837 if (x!=Py_None)
5838 break;
5839 ++collend;
5840 }
5841 /* cache callback name lookup
5842 * (if not done yet, i.e. it's the first error) */
5843 if (known_errorHandler==-1) {
5844 if ((errors==NULL) || (!strcmp(errors, "strict")))
5845 known_errorHandler = 1;
5846 else if (!strcmp(errors, "replace"))
5847 known_errorHandler = 2;
5848 else if (!strcmp(errors, "ignore"))
5849 known_errorHandler = 3;
5850 else if (!strcmp(errors, "xmlcharrefreplace"))
5851 known_errorHandler = 4;
5852 else
5853 known_errorHandler = 0;
5854 }
5855 switch (known_errorHandler) {
5856 case 1: /* strict */
5857 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005858 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 case 2: /* replace */
5860 /* No need to check for space, this is a 1:1 replacement */
5861 for (coll = collstart; coll<collend; ++coll)
5862 *str++ = '?';
5863 /* fall through */
5864 case 3: /* ignore */
5865 p = collend;
5866 break;
5867 case 4: /* xmlcharrefreplace */
5868 /* generate replacement (temporarily (mis)uses p) */
5869 for (p = collstart; p < collend; ++p) {
5870 char buffer[2+29+1+1];
5871 char *cp;
5872 sprintf(buffer, "&#%d;", (int)*p);
5873 if (charmaptranslate_makespace(&res, &str,
5874 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5875 goto onError;
5876 for (cp = buffer; *cp; ++cp)
5877 *str++ = *cp;
5878 }
5879 p = collend;
5880 break;
5881 default:
5882 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5883 reason, startp, size, &exc,
5884 collstart-startp, collend-startp, &newpos);
5885 if (repunicode == NULL)
5886 goto onError;
5887 /* generate replacement */
5888 repsize = PyUnicode_GET_SIZE(repunicode);
5889 if (charmaptranslate_makespace(&res, &str,
5890 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5891 Py_DECREF(repunicode);
5892 goto onError;
5893 }
5894 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5895 *str++ = *uni2;
5896 p = startp + newpos;
5897 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005898 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005899 }
5900 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005901 /* Resize if we allocated too much */
5902 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005903 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 if (PyUnicode_Resize(&res, respos) < 0)
5905 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 }
5907 Py_XDECREF(exc);
5908 Py_XDECREF(errorHandler);
5909 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005912 Py_XDECREF(res);
5913 Py_XDECREF(exc);
5914 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 return NULL;
5916}
5917
5918PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 PyObject *mapping,
5920 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921{
5922 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005923
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 str = PyUnicode_FromObject(str);
5925 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 PyUnicode_GET_SIZE(str),
5929 mapping,
5930 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 Py_DECREF(str);
5932 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005933
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 Py_XDECREF(str);
5936 return NULL;
5937}
Tim Petersced69f82003-09-16 20:30:58 +00005938
Guido van Rossum9e896b32000-04-05 20:11:21 +00005939/* --- Decimal Encoder ---------------------------------------------------- */
5940
5941int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 Py_ssize_t length,
5943 char *output,
5944 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005945{
5946 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 PyObject *errorHandler = NULL;
5948 PyObject *exc = NULL;
5949 const char *encoding = "decimal";
5950 const char *reason = "invalid decimal Unicode string";
5951 /* the following variable is used for caching string comparisons
5952 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5953 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005954
5955 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 PyErr_BadArgument();
5957 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005958 }
5959
5960 p = s;
5961 end = s + length;
5962 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 register Py_UNICODE ch = *p;
5964 int decimal;
5965 PyObject *repunicode;
5966 Py_ssize_t repsize;
5967 Py_ssize_t newpos;
5968 Py_UNICODE *uni2;
5969 Py_UNICODE *collstart;
5970 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005971
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005973 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 ++p;
5975 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005976 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 decimal = Py_UNICODE_TODECIMAL(ch);
5978 if (decimal >= 0) {
5979 *output++ = '0' + decimal;
5980 ++p;
5981 continue;
5982 }
5983 if (0 < ch && ch < 256) {
5984 *output++ = (char)ch;
5985 ++p;
5986 continue;
5987 }
5988 /* All other characters are considered unencodable */
5989 collstart = p;
5990 collend = p+1;
5991 while (collend < end) {
5992 if ((0 < *collend && *collend < 256) ||
5993 !Py_UNICODE_ISSPACE(*collend) ||
5994 Py_UNICODE_TODECIMAL(*collend))
5995 break;
5996 }
5997 /* cache callback name lookup
5998 * (if not done yet, i.e. it's the first error) */
5999 if (known_errorHandler==-1) {
6000 if ((errors==NULL) || (!strcmp(errors, "strict")))
6001 known_errorHandler = 1;
6002 else if (!strcmp(errors, "replace"))
6003 known_errorHandler = 2;
6004 else if (!strcmp(errors, "ignore"))
6005 known_errorHandler = 3;
6006 else if (!strcmp(errors, "xmlcharrefreplace"))
6007 known_errorHandler = 4;
6008 else
6009 known_errorHandler = 0;
6010 }
6011 switch (known_errorHandler) {
6012 case 1: /* strict */
6013 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6014 goto onError;
6015 case 2: /* replace */
6016 for (p = collstart; p < collend; ++p)
6017 *output++ = '?';
6018 /* fall through */
6019 case 3: /* ignore */
6020 p = collend;
6021 break;
6022 case 4: /* xmlcharrefreplace */
6023 /* generate replacement (temporarily (mis)uses p) */
6024 for (p = collstart; p < collend; ++p)
6025 output += sprintf(output, "&#%d;", (int)*p);
6026 p = collend;
6027 break;
6028 default:
6029 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6030 encoding, reason, s, length, &exc,
6031 collstart-s, collend-s, &newpos);
6032 if (repunicode == NULL)
6033 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006034 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006035 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006036 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6037 Py_DECREF(repunicode);
6038 goto onError;
6039 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 /* generate replacement */
6041 repsize = PyUnicode_GET_SIZE(repunicode);
6042 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6043 Py_UNICODE ch = *uni2;
6044 if (Py_UNICODE_ISSPACE(ch))
6045 *output++ = ' ';
6046 else {
6047 decimal = Py_UNICODE_TODECIMAL(ch);
6048 if (decimal >= 0)
6049 *output++ = '0' + decimal;
6050 else if (0 < ch && ch < 256)
6051 *output++ = (char)ch;
6052 else {
6053 Py_DECREF(repunicode);
6054 raise_encode_exception(&exc, encoding,
6055 s, length, collstart-s, collend-s, reason);
6056 goto onError;
6057 }
6058 }
6059 }
6060 p = s + newpos;
6061 Py_DECREF(repunicode);
6062 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006063 }
6064 /* 0-terminate the output string */
6065 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066 Py_XDECREF(exc);
6067 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006068 return 0;
6069
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 Py_XDECREF(exc);
6072 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006073 return -1;
6074}
6075
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076/* --- Helpers ------------------------------------------------------------ */
6077
Eric Smith8c663262007-08-25 02:26:07 +00006078#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006079#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006080
Thomas Wouters477c8d52006-05-27 19:21:47 +00006081#include "stringlib/count.h"
6082#include "stringlib/find.h"
6083#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006084#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006085
Eric Smith5807c412008-05-11 21:00:57 +00006086#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006087#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006088#include "stringlib/localeutil.h"
6089
/* Helper macro to fix up start/end slice values in place.
 *
 *   end   > len  -> end = len
 *   end   < 0    -> end += len, then clamped to 0 if still negative
 *   start < 0    -> start += len, then clamped to 0 if still negative
 *
 * `start` is not clamped from above.  `start` and `end` must be modifiable
 * lvalues; `len` may be evaluated more than once, so it must be free of
 * side effects.  The body is wrapped in do { } while (0) so the macro acts
 * as a single statement (safe under an unbraced if/else); it must be
 * followed by a semicolon.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006104
Martin v. Löwis18e16552006-02-15 17:27:45 +00006105Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006106 PyObject *substr,
6107 Py_ssize_t start,
6108 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006110 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006111 PyUnicodeObject* str_obj;
6112 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006113
Thomas Wouters477c8d52006-05-27 19:21:47 +00006114 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6115 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006117 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6118 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 Py_DECREF(str_obj);
6120 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 }
Tim Petersced69f82003-09-16 20:30:58 +00006122
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006123 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006124 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006125 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6126 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006127 );
6128
6129 Py_DECREF(sub_obj);
6130 Py_DECREF(str_obj);
6131
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 return result;
6133}
6134
Martin v. Löwis18e16552006-02-15 17:27:45 +00006135Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006136 PyObject *sub,
6137 Py_ssize_t start,
6138 Py_ssize_t end,
6139 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006141 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006142
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006144 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006146 sub = PyUnicode_FromObject(sub);
6147 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 Py_DECREF(str);
6149 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 }
Tim Petersced69f82003-09-16 20:30:58 +00006151
Thomas Wouters477c8d52006-05-27 19:21:47 +00006152 if (direction > 0)
6153 result = stringlib_find_slice(
6154 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6155 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6156 start, end
6157 );
6158 else
6159 result = stringlib_rfind_slice(
6160 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6161 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6162 start, end
6163 );
6164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006166 Py_DECREF(sub);
6167
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 return result;
6169}
6170
Tim Petersced69f82003-09-16 20:30:58 +00006171static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 PyUnicodeObject *substring,
6174 Py_ssize_t start,
6175 Py_ssize_t end,
6176 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 if (substring->length == 0)
6179 return 1;
6180
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006181 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 end -= substring->length;
6183 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
6186 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 if (Py_UNICODE_MATCH(self, end, substring))
6188 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189 } else {
6190 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 }
6193
6194 return 0;
6195}
6196
Martin v. Löwis18e16552006-02-15 17:27:45 +00006197Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 PyObject *substr,
6199 Py_ssize_t start,
6200 Py_ssize_t end,
6201 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006203 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006204
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 str = PyUnicode_FromObject(str);
6206 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 substr = PyUnicode_FromObject(substr);
6209 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 Py_DECREF(str);
6211 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 }
Tim Petersced69f82003-09-16 20:30:58 +00006213
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 (PyUnicodeObject *)substr,
6216 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 Py_DECREF(str);
6218 Py_DECREF(substr);
6219 return result;
6220}
6221
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222/* Apply fixfct filter to the Unicode object self and return a
6223 reference to the modified object */
6224
Tim Petersced69f82003-09-16 20:30:58 +00006225static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228{
6229
6230 PyUnicodeObject *u;
6231
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006232 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006235
6236 Py_UNICODE_COPY(u->str, self->str, self->length);
6237
Tim Peters7a29bd52001-09-12 03:03:31 +00006238 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 /* fixfct should return TRUE if it modified the buffer. If
6240 FALSE, return a reference to the original buffer instead
6241 (to save space, not time) */
6242 Py_INCREF(self);
6243 Py_DECREF(u);
6244 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 }
6246 return (PyObject*) u;
6247}
6248
Tim Petersced69f82003-09-16 20:30:58 +00006249static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250int fixupper(PyUnicodeObject *self)
6251{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006252 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 Py_UNICODE *s = self->str;
6254 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006255
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006258
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 ch = Py_UNICODE_TOUPPER(*s);
6260 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 *s = ch;
6263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 s++;
6265 }
6266
6267 return status;
6268}
6269
Tim Petersced69f82003-09-16 20:30:58 +00006270static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271int fixlower(PyUnicodeObject *self)
6272{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006273 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 Py_UNICODE *s = self->str;
6275 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006276
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006279
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 ch = Py_UNICODE_TOLOWER(*s);
6281 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 *s = ch;
6284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 s++;
6286 }
6287
6288 return status;
6289}
6290
Tim Petersced69f82003-09-16 20:30:58 +00006291static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292int fixswapcase(PyUnicodeObject *self)
6293{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006294 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 Py_UNICODE *s = self->str;
6296 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006297
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 while (len-- > 0) {
6299 if (Py_UNICODE_ISUPPER(*s)) {
6300 *s = Py_UNICODE_TOLOWER(*s);
6301 status = 1;
6302 } else if (Py_UNICODE_ISLOWER(*s)) {
6303 *s = Py_UNICODE_TOUPPER(*s);
6304 status = 1;
6305 }
6306 s++;
6307 }
6308
6309 return status;
6310}
6311
Tim Petersced69f82003-09-16 20:30:58 +00006312static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313int fixcapitalize(PyUnicodeObject *self)
6314{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006315 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006316 Py_UNICODE *s = self->str;
6317 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006318
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006319 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006321 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 *s = Py_UNICODE_TOUPPER(*s);
6323 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006325 s++;
6326 while (--len > 0) {
6327 if (Py_UNICODE_ISUPPER(*s)) {
6328 *s = Py_UNICODE_TOLOWER(*s);
6329 status = 1;
6330 }
6331 s++;
6332 }
6333 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334}
6335
6336static
6337int fixtitle(PyUnicodeObject *self)
6338{
6339 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6340 register Py_UNICODE *e;
6341 int previous_is_cased;
6342
6343 /* Shortcut for single character strings */
6344 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6346 if (*p != ch) {
6347 *p = ch;
6348 return 1;
6349 }
6350 else
6351 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 }
Tim Petersced69f82003-09-16 20:30:58 +00006353
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 e = p + PyUnicode_GET_SIZE(self);
6355 previous_is_cased = 0;
6356 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006358
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 if (previous_is_cased)
6360 *p = Py_UNICODE_TOLOWER(ch);
6361 else
6362 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006363
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 if (Py_UNICODE_ISLOWER(ch) ||
6365 Py_UNICODE_ISUPPER(ch) ||
6366 Py_UNICODE_ISTITLE(ch))
6367 previous_is_cased = 1;
6368 else
6369 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 }
6371 return 1;
6372}
6373
Tim Peters8ce9f162004-08-27 01:49:32 +00006374PyObject *
6375PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376{
Skip Montanaro6543b452004-09-16 03:28:13 +00006377 const Py_UNICODE blank = ' ';
6378 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006379 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006380 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006381 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6382 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006383 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6384 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006385 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006386 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
Tim Peters05eba1f2004-08-27 21:32:02 +00006388 fseq = PySequence_Fast(seq, "");
6389 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006390 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006391 }
6392
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006393 /* NOTE: the following code can't call back into Python code,
6394 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006395 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006396
Tim Peters05eba1f2004-08-27 21:32:02 +00006397 seqlen = PySequence_Fast_GET_SIZE(fseq);
6398 /* If empty sequence, return u"". */
6399 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006400 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6401 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006402 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006403 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006404 /* If singleton sequence with an exact Unicode, return that. */
6405 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 item = items[0];
6407 if (PyUnicode_CheckExact(item)) {
6408 Py_INCREF(item);
6409 res = (PyUnicodeObject *)item;
6410 goto Done;
6411 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006412 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006413 else {
6414 /* Set up sep and seplen */
6415 if (separator == NULL) {
6416 sep = &blank;
6417 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006418 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006419 else {
6420 if (!PyUnicode_Check(separator)) {
6421 PyErr_Format(PyExc_TypeError,
6422 "separator: expected str instance,"
6423 " %.80s found",
6424 Py_TYPE(separator)->tp_name);
6425 goto onError;
6426 }
6427 sep = PyUnicode_AS_UNICODE(separator);
6428 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006429 }
6430 }
6431
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006432 /* There are at least two things to join, or else we have a subclass
6433 * of str in the sequence.
6434 * Do a pre-pass to figure out the total amount of space we'll
6435 * need (sz), and see whether all argument are strings.
6436 */
6437 sz = 0;
6438 for (i = 0; i < seqlen; i++) {
6439 const Py_ssize_t old_sz = sz;
6440 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 if (!PyUnicode_Check(item)) {
6442 PyErr_Format(PyExc_TypeError,
6443 "sequence item %zd: expected str instance,"
6444 " %.80s found",
6445 i, Py_TYPE(item)->tp_name);
6446 goto onError;
6447 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006448 sz += PyUnicode_GET_SIZE(item);
6449 if (i != 0)
6450 sz += seplen;
6451 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6452 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006454 goto onError;
6455 }
6456 }
Tim Petersced69f82003-09-16 20:30:58 +00006457
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006458 res = _PyUnicode_New(sz);
6459 if (res == NULL)
6460 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006461
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006462 /* Catenate everything. */
6463 res_p = PyUnicode_AS_UNICODE(res);
6464 for (i = 0; i < seqlen; ++i) {
6465 Py_ssize_t itemlen;
6466 item = items[i];
6467 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 /* Copy item, and maybe the separator. */
6469 if (i) {
6470 Py_UNICODE_COPY(res_p, sep, seplen);
6471 res_p += seplen;
6472 }
6473 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6474 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006475 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006476
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006478 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 return (PyObject *)res;
6480
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006482 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006483 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 return NULL;
6485}
6486
Tim Petersced69f82003-09-16 20:30:58 +00006487static
6488PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 Py_ssize_t left,
6490 Py_ssize_t right,
6491 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492{
6493 PyUnicodeObject *u;
6494
6495 if (left < 0)
6496 left = 0;
6497 if (right < 0)
6498 right = 0;
6499
Tim Peters7a29bd52001-09-12 03:03:31 +00006500 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 Py_INCREF(self);
6502 return self;
6503 }
6504
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006505 if (left > PY_SSIZE_T_MAX - self->length ||
6506 right > PY_SSIZE_T_MAX - (left + self->length)) {
6507 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6508 return NULL;
6509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 u = _PyUnicode_New(left + self->length + right);
6511 if (u) {
6512 if (left)
6513 Py_UNICODE_FILL(u->str, fill, left);
6514 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6515 if (right)
6516 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6517 }
6518
6519 return u;
6520}
6521
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006522PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
6526 string = PyUnicode_FromObject(string);
6527 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006530 list = stringlib_splitlines(
6531 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6532 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533
6534 Py_DECREF(string);
6535 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536}
6537
Tim Petersced69f82003-09-16 20:30:58 +00006538static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 PyUnicodeObject *substring,
6541 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006544 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006547 return stringlib_split_whitespace(
6548 (PyObject*) self, self->str, self->length, maxcount
6549 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006551 return stringlib_split(
6552 (PyObject*) self, self->str, self->length,
6553 substring->str, substring->length,
6554 maxcount
6555 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556}
6557
Tim Petersced69f82003-09-16 20:30:58 +00006558static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006559PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 PyUnicodeObject *substring,
6561 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006562{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006563 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006564 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006565
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006566 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006567 return stringlib_rsplit_whitespace(
6568 (PyObject*) self, self->str, self->length, maxcount
6569 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006570
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006571 return stringlib_rsplit(
6572 (PyObject*) self, self->str, self->length,
6573 substring->str, substring->length,
6574 maxcount
6575 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006576}
6577
6578static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006580 PyUnicodeObject *str1,
6581 PyUnicodeObject *str2,
6582 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583{
6584 PyUnicodeObject *u;
6585
6586 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006588 else if (maxcount == 0 || self->length == 0)
6589 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
Thomas Wouters477c8d52006-05-27 19:21:47 +00006591 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006592 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006593 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006594 if (str1->length == 0)
6595 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006596 if (str1->length == 1) {
6597 /* replace characters */
6598 Py_UNICODE u1, u2;
6599 if (!findchar(self->str, self->length, str1->str[0]))
6600 goto nothing;
6601 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6602 if (!u)
6603 return NULL;
6604 Py_UNICODE_COPY(u->str, self->str, self->length);
6605 u1 = str1->str[0];
6606 u2 = str2->str[0];
6607 for (i = 0; i < u->length; i++)
6608 if (u->str[i] == u1) {
6609 if (--maxcount < 0)
6610 break;
6611 u->str[i] = u2;
6612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006614 i = stringlib_find(
6615 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006617 if (i < 0)
6618 goto nothing;
6619 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6620 if (!u)
6621 return NULL;
6622 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006623
6624 /* change everything in-place, starting with this one */
6625 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6626 i += str1->length;
6627
6628 while ( --maxcount > 0) {
6629 i = stringlib_find(self->str+i, self->length-i,
6630 str1->str, str1->length,
6631 i);
6632 if (i == -1)
6633 break;
6634 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6635 i += str1->length;
6636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006639
6640 Py_ssize_t n, i, j, e;
6641 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 Py_UNICODE *p;
6643
6644 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006645 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6646 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006647 if (n == 0)
6648 goto nothing;
6649 /* new_size = self->length + n * (str2->length - str1->length)); */
6650 delta = (str2->length - str1->length);
6651 if (delta == 0) {
6652 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006654 product = n * (str2->length - str1->length);
6655 if ((product / (str2->length - str1->length)) != n) {
6656 PyErr_SetString(PyExc_OverflowError,
6657 "replace string is too long");
6658 return NULL;
6659 }
6660 new_size = self->length + product;
6661 if (new_size < 0) {
6662 PyErr_SetString(PyExc_OverflowError,
6663 "replace string is too long");
6664 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 }
6666 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006667 u = _PyUnicode_New(new_size);
6668 if (!u)
6669 return NULL;
6670 i = 0;
6671 p = u->str;
6672 e = self->length - str1->length;
6673 if (str1->length > 0) {
6674 while (n-- > 0) {
6675 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006676 j = stringlib_find(self->str+i, self->length-i,
6677 str1->str, str1->length,
6678 i);
6679 if (j == -1)
6680 break;
6681 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006682 /* copy unchanged part [i:j] */
6683 Py_UNICODE_COPY(p, self->str+i, j-i);
6684 p += j - i;
6685 }
6686 /* copy substitution string */
6687 if (str2->length > 0) {
6688 Py_UNICODE_COPY(p, str2->str, str2->length);
6689 p += str2->length;
6690 }
6691 i = j + str1->length;
6692 }
6693 if (i < self->length)
6694 /* copy tail [i:] */
6695 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6696 } else {
6697 /* interleave */
6698 while (n > 0) {
6699 Py_UNICODE_COPY(p, str2->str, str2->length);
6700 p += str2->length;
6701 if (--n <= 0)
6702 break;
6703 *p++ = self->str[i++];
6704 }
6705 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006709
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006711 /* nothing to replace; return original string (when possible) */
6712 if (PyUnicode_CheckExact(self)) {
6713 Py_INCREF(self);
6714 return (PyObject *) self;
6715 }
6716 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
6719/* --- Unicode Object Methods --------------------------------------------- */
6720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006721PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723\n\
6724Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006725characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726
6727static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006728unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 return fixup(self, fixtitle);
6731}
6732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006733PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735\n\
6736Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006737have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738
6739static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006740unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 return fixup(self, fixcapitalize);
6743}
6744
6745#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006746PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748\n\
6749Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006750normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751
6752static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006753unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754{
6755 PyObject *list;
6756 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006757 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 /* Split into words */
6760 list = split(self, NULL, -1);
6761 if (!list)
6762 return NULL;
6763
6764 /* Capitalize each word */
6765 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6766 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 if (item == NULL)
6769 goto onError;
6770 Py_DECREF(PyList_GET_ITEM(list, i));
6771 PyList_SET_ITEM(list, i, item);
6772 }
6773
6774 /* Join the words to form a new string */
6775 item = PyUnicode_Join(NULL, list);
6776
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 Py_DECREF(list);
6779 return (PyObject *)item;
6780}
6781#endif
6782
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006783/* Argument converter. Coerces to a single unicode character */
6784
6785static int
6786convert_uc(PyObject *obj, void *addr)
6787{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006788 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6789 PyObject *uniobj;
6790 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006791
Benjamin Peterson14339b62009-01-31 16:36:08 +00006792 uniobj = PyUnicode_FromObject(obj);
6793 if (uniobj == NULL) {
6794 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006796 return 0;
6797 }
6798 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6799 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006801 Py_DECREF(uniobj);
6802 return 0;
6803 }
6804 unistr = PyUnicode_AS_UNICODE(uniobj);
6805 *fillcharloc = unistr[0];
6806 Py_DECREF(uniobj);
6807 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006808}
6809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006810PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006813Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006814done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815
6816static PyObject *
6817unicode_center(PyUnicodeObject *self, PyObject *args)
6818{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006819 Py_ssize_t marg, left;
6820 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006821 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822
Thomas Woutersde017742006-02-16 19:34:37 +00006823 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 return NULL;
6825
Tim Peters7a29bd52001-09-12 03:03:31 +00006826 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 Py_INCREF(self);
6828 return (PyObject*) self;
6829 }
6830
6831 marg = width - self->length;
6832 left = marg / 2 + (marg & width & 1);
6833
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006834 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835}
6836
Marc-André Lemburge5034372000-08-08 08:04:29 +00006837#if 0
6838
6839/* This code should go into some future Unicode collation support
6840 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006841 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006842
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006843/* speedy UTF-16 code point order comparison */
6844/* gleaned from: */
6845/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6846
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006847static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006848{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006849 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006850 0, 0, 0, 0, 0, 0, 0, 0,
6851 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006852 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006853};
6854
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855static int
6856unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6857{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006858 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 Py_UNICODE *s1 = str1->str;
6861 Py_UNICODE *s2 = str2->str;
6862
6863 len1 = str1->length;
6864 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006865
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006867 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006868
6869 c1 = *s1++;
6870 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006871
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 if (c1 > (1<<11) * 26)
6873 c1 += utf16Fixup[c1>>11];
6874 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006875 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006876 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006877
6878 if (c1 != c2)
6879 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006880
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006881 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 }
6883
6884 return (len1 < len2) ? -1 : (len1 != len2);
6885}
6886
Marc-André Lemburge5034372000-08-08 08:04:29 +00006887#else
6888
6889static int
6890unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6891{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006892 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006893
6894 Py_UNICODE *s1 = str1->str;
6895 Py_UNICODE *s2 = str2->str;
6896
6897 len1 = str1->length;
6898 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006899
Marc-André Lemburge5034372000-08-08 08:04:29 +00006900 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006901 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006902
Fredrik Lundh45714e92001-06-26 16:39:36 +00006903 c1 = *s1++;
6904 c2 = *s2++;
6905
6906 if (c1 != c2)
6907 return (c1 < c2) ? -1 : 1;
6908
Marc-André Lemburge5034372000-08-08 08:04:29 +00006909 len1--; len2--;
6910 }
6911
6912 return (len1 < len2) ? -1 : (len1 != len2);
6913}
6914
6915#endif
6916
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006920 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6921 return unicode_compare((PyUnicodeObject *)left,
6922 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006923 PyErr_Format(PyExc_TypeError,
6924 "Can't compare %.100s and %.100s",
6925 left->ob_type->tp_name,
6926 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 return -1;
6928}
6929
Martin v. Löwis5b222132007-06-10 09:51:05 +00006930int
6931PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6932{
6933 int i;
6934 Py_UNICODE *id;
6935 assert(PyUnicode_Check(uni));
6936 id = PyUnicode_AS_UNICODE(uni);
6937 /* Compare Unicode string and source character set string */
6938 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 if (id[i] != str[i])
6940 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006941 /* This check keeps Python strings that end in '\0' from comparing equal
6942 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006943 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006945 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006946 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006947 return 0;
6948}
6949
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006950
Benjamin Peterson29060642009-01-31 22:14:21 +00006951#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006952 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006953
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006954PyObject *PyUnicode_RichCompare(PyObject *left,
6955 PyObject *right,
6956 int op)
6957{
6958 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006959
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006960 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6961 PyObject *v;
6962 if (((PyUnicodeObject *) left)->length !=
6963 ((PyUnicodeObject *) right)->length) {
6964 if (op == Py_EQ) {
6965 Py_INCREF(Py_False);
6966 return Py_False;
6967 }
6968 if (op == Py_NE) {
6969 Py_INCREF(Py_True);
6970 return Py_True;
6971 }
6972 }
6973 if (left == right)
6974 result = 0;
6975 else
6976 result = unicode_compare((PyUnicodeObject *)left,
6977 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006978
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006979 /* Convert the return value to a Boolean */
6980 switch (op) {
6981 case Py_EQ:
6982 v = TEST_COND(result == 0);
6983 break;
6984 case Py_NE:
6985 v = TEST_COND(result != 0);
6986 break;
6987 case Py_LE:
6988 v = TEST_COND(result <= 0);
6989 break;
6990 case Py_GE:
6991 v = TEST_COND(result >= 0);
6992 break;
6993 case Py_LT:
6994 v = TEST_COND(result == -1);
6995 break;
6996 case Py_GT:
6997 v = TEST_COND(result == 1);
6998 break;
6999 default:
7000 PyErr_BadArgument();
7001 return NULL;
7002 }
7003 Py_INCREF(v);
7004 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007005 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007006
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007007 Py_INCREF(Py_NotImplemented);
7008 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007009}
7010
Guido van Rossum403d68b2000-03-13 15:55:09 +00007011int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007013{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007014 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007015 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007016
7017 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007018 sub = PyUnicode_FromObject(element);
7019 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 PyErr_Format(PyExc_TypeError,
7021 "'in <string>' requires string as left operand, not %s",
7022 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007023 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007024 }
7025
Thomas Wouters477c8d52006-05-27 19:21:47 +00007026 str = PyUnicode_FromObject(container);
7027 if (!str) {
7028 Py_DECREF(sub);
7029 return -1;
7030 }
7031
7032 result = stringlib_contains_obj(str, sub);
7033
7034 Py_DECREF(str);
7035 Py_DECREF(sub);
7036
Guido van Rossum403d68b2000-03-13 15:55:09 +00007037 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007038}
7039
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040/* Concat to string or Unicode object giving a new Unicode object. */
7041
7042PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044{
7045 PyUnicodeObject *u = NULL, *v = NULL, *w;
7046
7047 /* Coerce the two arguments */
7048 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7049 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007050 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7052 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054
7055 /* Shortcuts */
7056 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 Py_DECREF(v);
7058 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 }
7060 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 Py_DECREF(u);
7062 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 }
7064
7065 /* Concat the two Unicode strings */
7066 w = _PyUnicode_New(u->length + v->length);
7067 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 Py_UNICODE_COPY(w->str, u->str, u->length);
7070 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7071
7072 Py_DECREF(u);
7073 Py_DECREF(v);
7074 return (PyObject *)w;
7075
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 Py_XDECREF(u);
7078 Py_XDECREF(v);
7079 return NULL;
7080}
7081
Walter Dörwald1ab83302007-05-18 17:15:44 +00007082void
7083PyUnicode_Append(PyObject **pleft, PyObject *right)
7084{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007085 PyObject *new;
7086 if (*pleft == NULL)
7087 return;
7088 if (right == NULL || !PyUnicode_Check(*pleft)) {
7089 Py_DECREF(*pleft);
7090 *pleft = NULL;
7091 return;
7092 }
7093 new = PyUnicode_Concat(*pleft, right);
7094 Py_DECREF(*pleft);
7095 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007096}
7097
7098void
7099PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7100{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007101 PyUnicode_Append(pleft, right);
7102 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007103}
7104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007105PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007108Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007109string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007110interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111
7112static PyObject *
7113unicode_count(PyUnicodeObject *self, PyObject *args)
7114{
7115 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007116 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007117 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 PyObject *result;
7119
Guido van Rossumb8872e62000-05-09 14:14:27 +00007120 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 return NULL;
7123
7124 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007125 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007128
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007129 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007130 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007131 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007132 substring->str, substring->length,
7133 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007134 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135
7136 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007137
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 return result;
7139}
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007144Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007145to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007146handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007147a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7148'xmlcharrefreplace' as well as any other name registered with\n\
7149codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150
7151static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007152unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007154 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 char *encoding = NULL;
7156 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007157 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007158
Benjamin Peterson308d6372009-09-18 21:42:35 +00007159 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7160 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007162 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007163 if (v == NULL)
7164 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007165 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007166 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007167 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007168 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007169 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007170 Py_DECREF(v);
7171 return NULL;
7172 }
7173 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007174
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007176 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007177}
7178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007179PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181\n\
7182Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007183If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184
7185static PyObject*
7186unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7187{
7188 Py_UNICODE *e;
7189 Py_UNICODE *p;
7190 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007191 Py_UNICODE *qe;
7192 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 PyUnicodeObject *u;
7194 int tabsize = 8;
7195
7196 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198
Thomas Wouters7e474022000-07-16 12:04:32 +00007199 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007200 i = 0; /* chars up to and including most recent \n or \r */
7201 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7202 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 for (p = self->str; p < e; p++)
7204 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 if (tabsize > 0) {
7206 incr = tabsize - (j % tabsize); /* cannot overflow */
7207 if (j > PY_SSIZE_T_MAX - incr)
7208 goto overflow1;
7209 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007210 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 if (j > PY_SSIZE_T_MAX - 1)
7214 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 j++;
7216 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 if (i > PY_SSIZE_T_MAX - j)
7218 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007220 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 }
7222 }
7223
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007224 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007226
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 /* Second pass: create output string and fill it */
7228 u = _PyUnicode_New(i + j);
7229 if (!u)
7230 return NULL;
7231
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007232 j = 0; /* same as in first pass */
7233 q = u->str; /* next output char */
7234 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235
7236 for (p = self->str; p < e; p++)
7237 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 if (tabsize > 0) {
7239 i = tabsize - (j % tabsize);
7240 j += i;
7241 while (i--) {
7242 if (q >= qe)
7243 goto overflow2;
7244 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007245 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007247 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 else {
7249 if (q >= qe)
7250 goto overflow2;
7251 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007252 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253 if (*p == '\n' || *p == '\r')
7254 j = 0;
7255 }
7256
7257 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007258
7259 overflow2:
7260 Py_DECREF(u);
7261 overflow1:
7262 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264}
7265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007266PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268\n\
7269Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007270such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271arguments start and end are interpreted as in slice notation.\n\
7272\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007273Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274
7275static PyObject *
7276unicode_find(PyUnicodeObject *self, PyObject *args)
7277{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007278 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007279 Py_ssize_t start;
7280 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007281 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282
Christian Heimes9cd17752007-11-18 19:35:23 +00007283 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285
Thomas Wouters477c8d52006-05-27 19:21:47 +00007286 result = stringlib_find_slice(
7287 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7288 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7289 start, end
7290 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291
7292 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007293
Christian Heimes217cfd12007-12-02 14:31:20 +00007294 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295}
7296
7297static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007298unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299{
7300 if (index < 0 || index >= self->length) {
7301 PyErr_SetString(PyExc_IndexError, "string index out of range");
7302 return NULL;
7303 }
7304
7305 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7306}
7307
Guido van Rossumc2504932007-09-18 19:42:40 +00007308/* Believe it or not, this produces the same value for ASCII strings
7309 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007311unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312{
Guido van Rossumc2504932007-09-18 19:42:40 +00007313 Py_ssize_t len;
7314 Py_UNICODE *p;
7315 long x;
7316
7317 if (self->hash != -1)
7318 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007319 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007320 p = self->str;
7321 x = *p << 7;
7322 while (--len >= 0)
7323 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007324 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007325 if (x == -1)
7326 x = -2;
7327 self->hash = x;
7328 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329}
7330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007331PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007334Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335
7336static PyObject *
7337unicode_index(PyUnicodeObject *self, PyObject *args)
7338{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007339 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007340 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007341 Py_ssize_t start;
7342 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343
Christian Heimes9cd17752007-11-18 19:35:23 +00007344 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346
Thomas Wouters477c8d52006-05-27 19:21:47 +00007347 result = stringlib_find_slice(
7348 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7349 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7350 start, end
7351 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352
7353 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007354
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 if (result < 0) {
7356 PyErr_SetString(PyExc_ValueError, "substring not found");
7357 return NULL;
7358 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007359
Christian Heimes217cfd12007-12-02 14:31:20 +00007360 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361}
7362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007363PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007364 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007366Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007367at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368
7369static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007370unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371{
7372 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7373 register const Py_UNICODE *e;
7374 int cased;
7375
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 /* Shortcut for single character strings */
7377 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007380 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007381 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007383
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 e = p + PyUnicode_GET_SIZE(self);
7385 cased = 0;
7386 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007388
Benjamin Peterson29060642009-01-31 22:14:21 +00007389 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7390 return PyBool_FromLong(0);
7391 else if (!cased && Py_UNICODE_ISLOWER(ch))
7392 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007394 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395}
7396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007397PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007400Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007401at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
7403static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007404unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405{
7406 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7407 register const Py_UNICODE *e;
7408 int cased;
7409
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410 /* Shortcut for single character strings */
7411 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007414 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007415 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007417
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 e = p + PyUnicode_GET_SIZE(self);
7419 cased = 0;
7420 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007422
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7424 return PyBool_FromLong(0);
7425 else if (!cased && Py_UNICODE_ISUPPER(ch))
7426 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007428 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429}
7430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007431PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007434Return True if S is a titlecased string and there is at least one\n\
7435character in S, i.e. upper- and titlecase characters may only\n\
7436follow uncased characters and lowercase characters only cased ones.\n\
7437Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438
7439static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007440unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441{
7442 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7443 register const Py_UNICODE *e;
7444 int cased, previous_is_cased;
7445
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 /* Shortcut for single character strings */
7447 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7449 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007451 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007452 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007454
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 e = p + PyUnicode_GET_SIZE(self);
7456 cased = 0;
7457 previous_is_cased = 0;
7458 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007460
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7462 if (previous_is_cased)
7463 return PyBool_FromLong(0);
7464 previous_is_cased = 1;
7465 cased = 1;
7466 }
7467 else if (Py_UNICODE_ISLOWER(ch)) {
7468 if (!previous_is_cased)
7469 return PyBool_FromLong(0);
7470 previous_is_cased = 1;
7471 cased = 1;
7472 }
7473 else
7474 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007476 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477}
7478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007479PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007482Return True if all characters in S are whitespace\n\
7483and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484
7485static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007486unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487{
7488 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7489 register const Py_UNICODE *e;
7490
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 /* Shortcut for single character strings */
7492 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 Py_UNICODE_ISSPACE(*p))
7494 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007496 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007497 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007499
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 e = p + PyUnicode_GET_SIZE(self);
7501 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 if (!Py_UNICODE_ISSPACE(*p))
7503 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007505 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506}
7507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007508PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007510\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007511Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007512and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007513
7514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007515unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007516{
7517 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7518 register const Py_UNICODE *e;
7519
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007520 /* Shortcut for single character strings */
7521 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 Py_UNICODE_ISALPHA(*p))
7523 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007524
7525 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007526 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007528
7529 e = p + PyUnicode_GET_SIZE(self);
7530 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 if (!Py_UNICODE_ISALPHA(*p))
7532 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007533 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007534 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007535}
7536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007537PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007539\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007540Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007541and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007542
7543static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007544unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007545{
7546 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7547 register const Py_UNICODE *e;
7548
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007549 /* Shortcut for single character strings */
7550 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 Py_UNICODE_ISALNUM(*p))
7552 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007553
7554 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007555 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007557
7558 e = p + PyUnicode_GET_SIZE(self);
7559 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 if (!Py_UNICODE_ISALNUM(*p))
7561 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007562 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007563 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007564}
7565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007567 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007569Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007570False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571
7572static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007573unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574{
7575 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7576 register const Py_UNICODE *e;
7577
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 /* Shortcut for single character strings */
7579 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 Py_UNICODE_ISDECIMAL(*p))
7581 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007583 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007584 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007586
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587 e = p + PyUnicode_GET_SIZE(self);
7588 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 if (!Py_UNICODE_ISDECIMAL(*p))
7590 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007592 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593}
7594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007595PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007598Return True if all characters in S are digits\n\
7599and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600
7601static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007602unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603{
7604 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7605 register const Py_UNICODE *e;
7606
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 /* Shortcut for single character strings */
7608 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 Py_UNICODE_ISDIGIT(*p))
7610 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007612 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007613 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007615
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616 e = p + PyUnicode_GET_SIZE(self);
7617 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 if (!Py_UNICODE_ISDIGIT(*p))
7619 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007621 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622}
7623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007624PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007627Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007628False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629
7630static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007631unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632{
7633 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7634 register const Py_UNICODE *e;
7635
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 /* Shortcut for single character strings */
7637 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 Py_UNICODE_ISNUMERIC(*p))
7639 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007641 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007642 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007644
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 e = p + PyUnicode_GET_SIZE(self);
7646 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 if (!Py_UNICODE_ISNUMERIC(*p))
7648 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007650 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651}
7652
Martin v. Löwis47383402007-08-15 07:32:56 +00007653int
7654PyUnicode_IsIdentifier(PyObject *self)
7655{
7656 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7657 register const Py_UNICODE *e;
7658
7659 /* Special case for empty strings */
7660 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007662
7663 /* PEP 3131 says that the first character must be in
7664 XID_Start and subsequent characters in XID_Continue,
7665 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007666 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007667 letters, digits, underscore). However, given the current
7668 definition of XID_Start and XID_Continue, it is sufficient
7669 to check just for these, except that _ must be allowed
7670 as starting an identifier. */
7671 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7672 return 0;
7673
7674 e = p + PyUnicode_GET_SIZE(self);
7675 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 if (!_PyUnicode_IsXidContinue(*p))
7677 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007678 }
7679 return 1;
7680}
7681
7682PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007684\n\
7685Return True if S is a valid identifier according\n\
7686to the language definition.");
7687
7688static PyObject*
7689unicode_isidentifier(PyObject *self)
7690{
7691 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7692}
7693
Georg Brandl559e5d72008-06-11 18:37:52 +00007694PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007696\n\
7697Return True if all characters in S are considered\n\
7698printable in repr() or S is empty, False otherwise.");
7699
7700static PyObject*
7701unicode_isprintable(PyObject *self)
7702{
7703 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7704 register const Py_UNICODE *e;
7705
7706 /* Shortcut for single character strings */
7707 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7708 Py_RETURN_TRUE;
7709 }
7710
7711 e = p + PyUnicode_GET_SIZE(self);
7712 for (; p < e; p++) {
7713 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7714 Py_RETURN_FALSE;
7715 }
7716 }
7717 Py_RETURN_TRUE;
7718}
7719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007720PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007721 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722\n\
7723Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007724iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725
7726static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007727unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007729 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730}
7731
Martin v. Löwis18e16552006-02-15 17:27:45 +00007732static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733unicode_length(PyUnicodeObject *self)
7734{
7735 return self->length;
7736}
7737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007738PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007741Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007742done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743
7744static PyObject *
7745unicode_ljust(PyUnicodeObject *self, PyObject *args)
7746{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007747 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007748 Py_UNICODE fillchar = ' ';
7749
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007750 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 return NULL;
7752
Tim Peters7a29bd52001-09-12 03:03:31 +00007753 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 Py_INCREF(self);
7755 return (PyObject*) self;
7756 }
7757
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007758 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759}
7760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007761PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007764Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
7766static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007767unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 return fixup(self, fixlower);
7770}
7771
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7780
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007781/* externally visible for str.strip(unicode) */
7782PyObject *
7783_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7784{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007785 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7786 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7787 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7788 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7789 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007790
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007792
Benjamin Peterson14339b62009-01-31 16:36:08 +00007793 i = 0;
7794 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7796 i++;
7797 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007798 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007799
Benjamin Peterson14339b62009-01-31 16:36:08 +00007800 j = len;
7801 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 do {
7803 j--;
7804 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7805 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007806 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007807
Benjamin Peterson14339b62009-01-31 16:36:08 +00007808 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 Py_INCREF(self);
7810 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007811 }
7812 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007814}
7815
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816
7817static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007818do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007820 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7821 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007822
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 i = 0;
7824 if (striptype != RIGHTSTRIP) {
7825 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7826 i++;
7827 }
7828 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007829
Benjamin Peterson14339b62009-01-31 16:36:08 +00007830 j = len;
7831 if (striptype != LEFTSTRIP) {
7832 do {
7833 j--;
7834 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7835 j++;
7836 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007837
Benjamin Peterson14339b62009-01-31 16:36:08 +00007838 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7839 Py_INCREF(self);
7840 return (PyObject*)self;
7841 }
7842 else
7843 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844}
7845
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007846
7847static PyObject *
7848do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7849{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007850 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007851
Benjamin Peterson14339b62009-01-31 16:36:08 +00007852 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7853 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007854
Benjamin Peterson14339b62009-01-31 16:36:08 +00007855 if (sep != NULL && sep != Py_None) {
7856 if (PyUnicode_Check(sep))
7857 return _PyUnicode_XStrip(self, striptype, sep);
7858 else {
7859 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 "%s arg must be None or str",
7861 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007862 return NULL;
7863 }
7864 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007865
Benjamin Peterson14339b62009-01-31 16:36:08 +00007866 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007867}
7868
7869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007870PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007872\n\
7873Return a copy of the string S with leading and trailing\n\
7874whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007875If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007876
7877static PyObject *
7878unicode_strip(PyUnicodeObject *self, PyObject *args)
7879{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007880 if (PyTuple_GET_SIZE(args) == 0)
7881 return do_strip(self, BOTHSTRIP); /* Common case */
7882 else
7883 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007884}
7885
7886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007887PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007889\n\
7890Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007891If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007892
7893static PyObject *
7894unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7895{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007896 if (PyTuple_GET_SIZE(args) == 0)
7897 return do_strip(self, LEFTSTRIP); /* Common case */
7898 else
7899 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007900}
7901
7902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007903PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007905\n\
7906Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007907If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007908
7909static PyObject *
7910unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7911{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007912 if (PyTuple_GET_SIZE(args) == 0)
7913 return do_strip(self, RIGHTSTRIP); /* Common case */
7914 else
7915 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007916}
7917
7918
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007920unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921{
7922 PyUnicodeObject *u;
7923 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007924 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007925 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926
Georg Brandl222de0f2009-04-12 12:01:50 +00007927 if (len < 1) {
7928 Py_INCREF(unicode_empty);
7929 return (PyObject *)unicode_empty;
7930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931
Tim Peters7a29bd52001-09-12 03:03:31 +00007932 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933 /* no repeat, return original string */
7934 Py_INCREF(str);
7935 return (PyObject*) str;
7936 }
Tim Peters8f422462000-09-09 06:13:41 +00007937
7938 /* ensure # of chars needed doesn't overflow int and # of bytes
7939 * needed doesn't overflow size_t
7940 */
7941 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007942 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007943 PyErr_SetString(PyExc_OverflowError,
7944 "repeated string is too long");
7945 return NULL;
7946 }
7947 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7948 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7949 PyErr_SetString(PyExc_OverflowError,
7950 "repeated string is too long");
7951 return NULL;
7952 }
7953 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954 if (!u)
7955 return NULL;
7956
7957 p = u->str;
7958
Georg Brandl222de0f2009-04-12 12:01:50 +00007959 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007960 Py_UNICODE_FILL(p, str->str[0], len);
7961 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007962 Py_ssize_t done = str->length; /* number of characters copied this far */
7963 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007965 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007966 Py_UNICODE_COPY(p+done, p, n);
7967 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 }
7970
7971 return (PyObject*) u;
7972}
7973
7974PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 PyObject *subobj,
7976 PyObject *replobj,
7977 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978{
7979 PyObject *self;
7980 PyObject *str1;
7981 PyObject *str2;
7982 PyObject *result;
7983
7984 self = PyUnicode_FromObject(obj);
7985 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 str1 = PyUnicode_FromObject(subobj);
7988 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 Py_DECREF(self);
7990 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 }
7992 str2 = PyUnicode_FromObject(replobj);
7993 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 Py_DECREF(self);
7995 Py_DECREF(str1);
7996 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 }
Tim Petersced69f82003-09-16 20:30:58 +00007998 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 (PyUnicodeObject *)str1,
8000 (PyUnicodeObject *)str2,
8001 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 Py_DECREF(self);
8003 Py_DECREF(str1);
8004 Py_DECREF(str2);
8005 return result;
8006}
8007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008008PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008009 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010\n\
8011Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008012old replaced by new. If the optional argument count is\n\
8013given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014
8015static PyObject*
8016unicode_replace(PyUnicodeObject *self, PyObject *args)
8017{
8018 PyUnicodeObject *str1;
8019 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008020 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 PyObject *result;
8022
Martin v. Löwis18e16552006-02-15 17:27:45 +00008023 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 return NULL;
8025 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8026 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008029 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 Py_DECREF(str1);
8031 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033
8034 result = replace(self, str1, str2, maxcount);
8035
8036 Py_DECREF(str1);
8037 Py_DECREF(str2);
8038 return result;
8039}
8040
8041static
8042PyObject *unicode_repr(PyObject *unicode)
8043{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008044 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008045 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008046 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8047 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8048
8049 /* XXX(nnorwitz): rather than over-allocating, it would be
8050 better to choose a different scheme. Perhaps scan the
8051 first N-chars of the string and allocate based on that size.
8052 */
8053 /* Initial allocation is based on the longest-possible unichr
8054 escape.
8055
8056 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8057 unichr, so in this case it's the longest unichr escape. In
8058 narrow (UTF-16) builds this is five chars per source unichr
8059 since there are two unichrs in the surrogate pair, so in narrow
8060 (UTF-16) builds it's not the longest unichr escape.
8061
8062 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8063 so in the narrow (UTF-16) build case it's the longest unichr
8064 escape.
8065 */
8066
Walter Dörwald1ab83302007-05-18 17:15:44 +00008067 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008069#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008071#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008073#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008075 if (repr == NULL)
8076 return NULL;
8077
Walter Dörwald1ab83302007-05-18 17:15:44 +00008078 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008079
8080 /* Add quote */
8081 *p++ = (findchar(s, size, '\'') &&
8082 !findchar(s, size, '"')) ? '"' : '\'';
8083 while (size-- > 0) {
8084 Py_UNICODE ch = *s++;
8085
8086 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008087 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008088 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008089 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008090 continue;
8091 }
8092
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008094 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008095 *p++ = '\\';
8096 *p++ = 't';
8097 }
8098 else if (ch == '\n') {
8099 *p++ = '\\';
8100 *p++ = 'n';
8101 }
8102 else if (ch == '\r') {
8103 *p++ = '\\';
8104 *p++ = 'r';
8105 }
8106
8107 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008108 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008109 *p++ = '\\';
8110 *p++ = 'x';
8111 *p++ = hexdigits[(ch >> 4) & 0x000F];
8112 *p++ = hexdigits[ch & 0x000F];
8113 }
8114
Georg Brandl559e5d72008-06-11 18:37:52 +00008115 /* Copy ASCII characters as-is */
8116 else if (ch < 0x7F) {
8117 *p++ = ch;
8118 }
8119
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008121 else {
8122 Py_UCS4 ucs = ch;
8123
8124#ifndef Py_UNICODE_WIDE
8125 Py_UNICODE ch2 = 0;
8126 /* Get code point from surrogate pair */
8127 if (size > 0) {
8128 ch2 = *s;
8129 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008131 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008133 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008134 size--;
8135 }
8136 }
8137#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008138 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008139 (categories Z* and C* except ASCII space)
8140 */
8141 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8142 /* Map 8-bit characters to '\xhh' */
8143 if (ucs <= 0xff) {
8144 *p++ = '\\';
8145 *p++ = 'x';
8146 *p++ = hexdigits[(ch >> 4) & 0x000F];
8147 *p++ = hexdigits[ch & 0x000F];
8148 }
8149 /* Map 21-bit characters to '\U00xxxxxx' */
8150 else if (ucs >= 0x10000) {
8151 *p++ = '\\';
8152 *p++ = 'U';
8153 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8154 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8155 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8156 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8157 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8158 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8159 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8160 *p++ = hexdigits[ucs & 0x0000000F];
8161 }
8162 /* Map 16-bit characters to '\uxxxx' */
8163 else {
8164 *p++ = '\\';
8165 *p++ = 'u';
8166 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8167 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8168 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8169 *p++ = hexdigits[ucs & 0x000F];
8170 }
8171 }
8172 /* Copy characters as-is */
8173 else {
8174 *p++ = ch;
8175#ifndef Py_UNICODE_WIDE
8176 if (ucs >= 0x10000)
8177 *p++ = ch2;
8178#endif
8179 }
8180 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008181 }
8182 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008183 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008184
8185 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008186 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008187 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188}
8189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008190PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192\n\
8193Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008194such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195arguments start and end are interpreted as in slice notation.\n\
8196\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008197Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198
8199static PyObject *
8200unicode_rfind(PyUnicodeObject *self, PyObject *args)
8201{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008202 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008203 Py_ssize_t start;
8204 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008205 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206
Christian Heimes9cd17752007-11-18 19:35:23 +00008207 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209
Thomas Wouters477c8d52006-05-27 19:21:47 +00008210 result = stringlib_rfind_slice(
8211 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8212 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8213 start, end
8214 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215
8216 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008217
Christian Heimes217cfd12007-12-02 14:31:20 +00008218 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219}
8220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008221PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008224Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225
8226static PyObject *
8227unicode_rindex(PyUnicodeObject *self, PyObject *args)
8228{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008229 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008230 Py_ssize_t start;
8231 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008232 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233
Christian Heimes9cd17752007-11-18 19:35:23 +00008234 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236
Thomas Wouters477c8d52006-05-27 19:21:47 +00008237 result = stringlib_rfind_slice(
8238 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8239 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8240 start, end
8241 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242
8243 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008244
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 if (result < 0) {
8246 PyErr_SetString(PyExc_ValueError, "substring not found");
8247 return NULL;
8248 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008249 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250}
8251
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008252PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008255Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008256done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257
8258static PyObject *
8259unicode_rjust(PyUnicodeObject *self, PyObject *args)
8260{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008261 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008262 Py_UNICODE fillchar = ' ';
8263
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008264 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 return NULL;
8266
Tim Peters7a29bd52001-09-12 03:03:31 +00008267 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 Py_INCREF(self);
8269 return (PyObject*) self;
8270 }
8271
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008272 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273}
8274
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 PyObject *sep,
8277 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278{
8279 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008280
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 s = PyUnicode_FromObject(s);
8282 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 if (sep != NULL) {
8285 sep = PyUnicode_FromObject(sep);
8286 if (sep == NULL) {
8287 Py_DECREF(s);
8288 return NULL;
8289 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 }
8291
8292 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8293
8294 Py_DECREF(s);
8295 Py_XDECREF(sep);
8296 return result;
8297}
8298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008299PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301\n\
8302Return a list of the words in S, using sep as the\n\
8303delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008304splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008305whitespace string is a separator and empty strings are\n\
8306removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307
8308static PyObject*
8309unicode_split(PyUnicodeObject *self, PyObject *args)
8310{
8311 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008312 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313
Martin v. Löwis18e16552006-02-15 17:27:45 +00008314 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 return NULL;
8316
8317 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323}
8324
Thomas Wouters477c8d52006-05-27 19:21:47 +00008325PyObject *
8326PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8327{
8328 PyObject* str_obj;
8329 PyObject* sep_obj;
8330 PyObject* out;
8331
8332 str_obj = PyUnicode_FromObject(str_in);
8333 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008335 sep_obj = PyUnicode_FromObject(sep_in);
8336 if (!sep_obj) {
8337 Py_DECREF(str_obj);
8338 return NULL;
8339 }
8340
8341 out = stringlib_partition(
8342 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8343 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8344 );
8345
8346 Py_DECREF(sep_obj);
8347 Py_DECREF(str_obj);
8348
8349 return out;
8350}
8351
8352
8353PyObject *
8354PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8355{
8356 PyObject* str_obj;
8357 PyObject* sep_obj;
8358 PyObject* out;
8359
8360 str_obj = PyUnicode_FromObject(str_in);
8361 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008363 sep_obj = PyUnicode_FromObject(sep_in);
8364 if (!sep_obj) {
8365 Py_DECREF(str_obj);
8366 return NULL;
8367 }
8368
8369 out = stringlib_rpartition(
8370 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8371 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8372 );
8373
8374 Py_DECREF(sep_obj);
8375 Py_DECREF(str_obj);
8376
8377 return out;
8378}
8379
8380PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008382\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008383Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008384the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008385found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008386
8387static PyObject*
8388unicode_partition(PyUnicodeObject *self, PyObject *separator)
8389{
8390 return PyUnicode_Partition((PyObject *)self, separator);
8391}
8392
8393PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008394 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008395\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008396Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008397the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008398separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008399
8400static PyObject*
8401unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8402{
8403 return PyUnicode_RPartition((PyObject *)self, separator);
8404}
8405
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008406PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 PyObject *sep,
8408 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008409{
8410 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008411
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008412 s = PyUnicode_FromObject(s);
8413 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008414 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 if (sep != NULL) {
8416 sep = PyUnicode_FromObject(sep);
8417 if (sep == NULL) {
8418 Py_DECREF(s);
8419 return NULL;
8420 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008421 }
8422
8423 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8424
8425 Py_DECREF(s);
8426 Py_XDECREF(sep);
8427 return result;
8428}
8429
8430PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008432\n\
8433Return a list of the words in S, using sep as the\n\
8434delimiter string, starting at the end of the string and\n\
8435working to the front. If maxsplit is given, at most maxsplit\n\
8436splits are done. If sep is not specified, any whitespace string\n\
8437is a separator.");
8438
8439static PyObject*
8440unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8441{
8442 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008443 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008444
Martin v. Löwis18e16552006-02-15 17:27:45 +00008445 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008446 return NULL;
8447
8448 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008450 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008452 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008454}
8455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008456PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458\n\
8459Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008460Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008461is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462
8463static PyObject*
8464unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8465{
Guido van Rossum86662912000-04-11 15:38:46 +00008466 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467
Guido van Rossum86662912000-04-11 15:38:46 +00008468 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469 return NULL;
8470
Guido van Rossum86662912000-04-11 15:38:46 +00008471 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472}
8473
8474static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008475PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476{
Walter Dörwald346737f2007-05-31 10:44:43 +00008477 if (PyUnicode_CheckExact(self)) {
8478 Py_INCREF(self);
8479 return self;
8480 } else
8481 /* Subtype -- return genuine unicode string with the same value. */
8482 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8483 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484}
8485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008486PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488\n\
8489Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008490and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491
8492static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008493unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 return fixup(self, fixswapcase);
8496}
8497
Georg Brandlceee0772007-11-27 23:48:05 +00008498PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008500\n\
8501Return a translation table usable for str.translate().\n\
8502If there is only one argument, it must be a dictionary mapping Unicode\n\
8503ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008504Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008505If there are two arguments, they must be strings of equal length, and\n\
8506in the resulting dictionary, each character in x will be mapped to the\n\
8507character at the same position in y. If there is a third argument, it\n\
8508must be a string, whose characters will be mapped to None in the result.");
8509
8510static PyObject*
8511unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8512{
8513 PyObject *x, *y = NULL, *z = NULL;
8514 PyObject *new = NULL, *key, *value;
8515 Py_ssize_t i = 0;
8516 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517
Georg Brandlceee0772007-11-27 23:48:05 +00008518 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8519 return NULL;
8520 new = PyDict_New();
8521 if (!new)
8522 return NULL;
8523 if (y != NULL) {
8524 /* x must be a string too, of equal length */
8525 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8526 if (!PyUnicode_Check(x)) {
8527 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8528 "be a string if there is a second argument");
8529 goto err;
8530 }
8531 if (PyUnicode_GET_SIZE(x) != ylen) {
8532 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8533 "arguments must have equal length");
8534 goto err;
8535 }
8536 /* create entries for translating chars in x to those in y */
8537 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008538 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8539 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008540 if (!key || !value)
8541 goto err;
8542 res = PyDict_SetItem(new, key, value);
8543 Py_DECREF(key);
8544 Py_DECREF(value);
8545 if (res < 0)
8546 goto err;
8547 }
8548 /* create entries for deleting chars in z */
8549 if (z != NULL) {
8550 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008551 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008552 if (!key)
8553 goto err;
8554 res = PyDict_SetItem(new, key, Py_None);
8555 Py_DECREF(key);
8556 if (res < 0)
8557 goto err;
8558 }
8559 }
8560 } else {
8561 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008562 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008563 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8564 "to maketrans it must be a dict");
8565 goto err;
8566 }
8567 /* copy entries into the new dict, converting string keys to int keys */
8568 while (PyDict_Next(x, &i, &key, &value)) {
8569 if (PyUnicode_Check(key)) {
8570 /* convert string keys to integer keys */
8571 PyObject *newkey;
8572 if (PyUnicode_GET_SIZE(key) != 1) {
8573 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8574 "table must be of length 1");
8575 goto err;
8576 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008577 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008578 if (!newkey)
8579 goto err;
8580 res = PyDict_SetItem(new, newkey, value);
8581 Py_DECREF(newkey);
8582 if (res < 0)
8583 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008584 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008585 /* just keep integer keys */
8586 if (PyDict_SetItem(new, key, value) < 0)
8587 goto err;
8588 } else {
8589 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8590 "be strings or integers");
8591 goto err;
8592 }
8593 }
8594 }
8595 return new;
8596 err:
8597 Py_DECREF(new);
8598 return NULL;
8599}
8600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008601PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603\n\
8604Return a copy of the string S, where all characters have been mapped\n\
8605through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008606Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008607Unmapped characters are left untouched. Characters mapped to None\n\
8608are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
8610static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008611unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612{
Georg Brandlceee0772007-11-27 23:48:05 +00008613 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614}
8615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008616PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008619Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620
8621static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008622unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 return fixup(self, fixupper);
8625}
8626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008627PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008630Pad a numeric string S with zeros on the left, to fill a field\n\
8631of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632
8633static PyObject *
8634unicode_zfill(PyUnicodeObject *self, PyObject *args)
8635{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008636 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 PyUnicodeObject *u;
8638
Martin v. Löwis18e16552006-02-15 17:27:45 +00008639 Py_ssize_t width;
8640 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 return NULL;
8642
8643 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008644 if (PyUnicode_CheckExact(self)) {
8645 Py_INCREF(self);
8646 return (PyObject*) self;
8647 }
8648 else
8649 return PyUnicode_FromUnicode(
8650 PyUnicode_AS_UNICODE(self),
8651 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 }
8654
8655 fill = width - self->length;
8656
8657 u = pad(self, fill, 0, '0');
8658
Walter Dörwald068325e2002-04-15 13:36:47 +00008659 if (u == NULL)
8660 return NULL;
8661
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 if (u->str[fill] == '+' || u->str[fill] == '-') {
8663 /* move sign to beginning of string */
8664 u->str[0] = u->str[fill];
8665 u->str[fill] = '0';
8666 }
8667
8668 return (PyObject*) u;
8669}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670
#if 0
/* Compiled out.  Returns the current value of numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008679PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008682Return True if S starts with the specified prefix, False otherwise.\n\
8683With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008684With optional end, stop comparing S at that position.\n\
8685prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686
8687static PyObject *
8688unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008691 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008693 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008694 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008695 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008697 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8699 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008700 if (PyTuple_Check(subobj)) {
8701 Py_ssize_t i;
8702 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8703 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008705 if (substring == NULL)
8706 return NULL;
8707 result = tailmatch(self, substring, start, end, -1);
8708 Py_DECREF(substring);
8709 if (result) {
8710 Py_RETURN_TRUE;
8711 }
8712 }
8713 /* nothing matched */
8714 Py_RETURN_FALSE;
8715 }
8716 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008719 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008721 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722}
8723
8724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008725PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008728Return True if S ends with the specified suffix, False otherwise.\n\
8729With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008730With optional end, stop comparing S at that position.\n\
8731suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732
8733static PyObject *
8734unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008737 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008739 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008740 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008741 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008743 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8745 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008746 if (PyTuple_Check(subobj)) {
8747 Py_ssize_t i;
8748 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8749 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008751 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008753 result = tailmatch(self, substring, start, end, +1);
8754 Py_DECREF(substring);
8755 if (result) {
8756 Py_RETURN_TRUE;
8757 }
8758 }
8759 Py_RETURN_FALSE;
8760 }
8761 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008765 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008767 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768}
8769
Eric Smith8c663262007-08-25 02:26:07 +00008770#include "stringlib/string_format.h"
8771
8772PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008774\n\
8775");
8776
Eric Smith4a7d76d2008-05-30 18:10:19 +00008777static PyObject *
8778unicode__format__(PyObject* self, PyObject* args)
8779{
8780 PyObject *format_spec;
8781
8782 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8783 return NULL;
8784
8785 return _PyUnicode_FormatAdvanced(self,
8786 PyUnicode_AS_UNICODE(format_spec),
8787 PyUnicode_GET_SIZE(format_spec));
8788}
8789
Eric Smith8c663262007-08-25 02:26:07 +00008790PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008792\n\
8793");
8794
8795static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008796unicode__sizeof__(PyUnicodeObject *v)
8797{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008798 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8799 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008800}
8801
8802PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008804
8805static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008806unicode_getnewargs(PyUnicodeObject *v)
8807{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008808 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008809}
8810
8811
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812static PyMethodDef unicode_methods[] = {
8813
8814 /* Order is according to common usage: often used methods should
8815 appear first, since lookup is done sequentially. */
8816
Benjamin Peterson308d6372009-09-18 21:42:35 +00008817 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008818 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8819 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008820 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008821 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8822 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8823 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8824 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8825 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8826 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8827 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008828 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008829 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8830 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8831 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008832 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008833 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8834 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8835 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008836 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008837 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008838 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008839 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008840 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8841 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8842 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8843 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8844 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8845 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8846 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8847 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8848 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8849 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8850 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8851 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8852 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8853 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008854 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008855 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008856 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008857 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008858 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008859 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8860 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008861 {"maketrans", (PyCFunction) unicode_maketrans,
8862 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008863 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008864#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008865 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866#endif
8867
8868#if 0
8869 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008870 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871#endif
8872
Benjamin Peterson14339b62009-01-31 16:36:08 +00008873 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 {NULL, NULL}
8875};
8876
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008877static PyObject *
8878unicode_mod(PyObject *v, PyObject *w)
8879{
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 if (!PyUnicode_Check(v)) {
8881 Py_INCREF(Py_NotImplemented);
8882 return Py_NotImplemented;
8883 }
8884 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008885}
8886
8887static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008888 0, /*nb_add*/
8889 0, /*nb_subtract*/
8890 0, /*nb_multiply*/
8891 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008892};
8893
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008895 (lenfunc) unicode_length, /* sq_length */
8896 PyUnicode_Concat, /* sq_concat */
8897 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8898 (ssizeargfunc) unicode_getitem, /* sq_item */
8899 0, /* sq_slice */
8900 0, /* sq_ass_item */
8901 0, /* sq_ass_slice */
8902 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903};
8904
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008905static PyObject*
8906unicode_subscript(PyUnicodeObject* self, PyObject* item)
8907{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008908 if (PyIndex_Check(item)) {
8909 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008910 if (i == -1 && PyErr_Occurred())
8911 return NULL;
8912 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008913 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008914 return unicode_getitem(self, i);
8915 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008916 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008917 Py_UNICODE* source_buf;
8918 Py_UNICODE* result_buf;
8919 PyObject* result;
8920
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008921 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008923 return NULL;
8924 }
8925
8926 if (slicelength <= 0) {
8927 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008928 } else if (start == 0 && step == 1 && slicelength == self->length &&
8929 PyUnicode_CheckExact(self)) {
8930 Py_INCREF(self);
8931 return (PyObject *)self;
8932 } else if (step == 1) {
8933 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008934 } else {
8935 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008936 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8937 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008938
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 if (result_buf == NULL)
8940 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008941
8942 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8943 result_buf[i] = source_buf[cur];
8944 }
Tim Petersced69f82003-09-16 20:30:58 +00008945
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008946 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008947 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008948 return result;
8949 }
8950 } else {
8951 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8952 return NULL;
8953 }
8954}
8955
8956static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008957 (lenfunc)unicode_length, /* mp_length */
8958 (binaryfunc)unicode_subscript, /* mp_subscript */
8959 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008960};
8961
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963/* Helpers for PyUnicode_Format() */
8964
8965static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008966getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008968 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008970 (*p_argidx)++;
8971 if (arglen < 0)
8972 return args;
8973 else
8974 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 }
8976 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008977 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 return NULL;
8979}
8980
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008981/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008983static PyObject *
8984formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008986 char *p;
8987 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008989
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990 x = PyFloat_AsDouble(v);
8991 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008992 return NULL;
8993
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008996
Eric Smith0923d1d2009-04-16 20:16:10 +00008997 p = PyOS_double_to_string(x, type, prec,
8998 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008999 if (p == NULL)
9000 return NULL;
9001 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009002 PyMem_Free(p);
9003 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004}
9005
Tim Peters38fd5b62000-09-21 05:43:11 +00009006static PyObject*
9007formatlong(PyObject *val, int flags, int prec, int type)
9008{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009009 char *buf;
9010 int len;
9011 PyObject *str; /* temporary string object. */
9012 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009013
Benjamin Peterson14339b62009-01-31 16:36:08 +00009014 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9015 if (!str)
9016 return NULL;
9017 result = PyUnicode_FromStringAndSize(buf, len);
9018 Py_DECREF(str);
9019 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009020}
9021
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022static int
9023formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009024 size_t buflen,
9025 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009027 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009028 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 if (PyUnicode_GET_SIZE(v) == 1) {
9030 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9031 buf[1] = '\0';
9032 return 1;
9033 }
9034#ifndef Py_UNICODE_WIDE
9035 if (PyUnicode_GET_SIZE(v) == 2) {
9036 /* Decode a valid surrogate pair */
9037 int c0 = PyUnicode_AS_UNICODE(v)[0];
9038 int c1 = PyUnicode_AS_UNICODE(v)[1];
9039 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9040 0xDC00 <= c1 && c1 <= 0xDFFF) {
9041 buf[0] = c0;
9042 buf[1] = c1;
9043 buf[2] = '\0';
9044 return 2;
9045 }
9046 }
9047#endif
9048 goto onError;
9049 }
9050 else {
9051 /* Integer input truncated to a character */
9052 long x;
9053 x = PyLong_AsLong(v);
9054 if (x == -1 && PyErr_Occurred())
9055 goto onError;
9056
9057 if (x < 0 || x > 0x10ffff) {
9058 PyErr_SetString(PyExc_OverflowError,
9059 "%c arg not in range(0x110000)");
9060 return -1;
9061 }
9062
9063#ifndef Py_UNICODE_WIDE
9064 if (x > 0xffff) {
9065 x -= 0x10000;
9066 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9067 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9068 return 2;
9069 }
9070#endif
9071 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009072 buf[1] = '\0';
9073 return 1;
9074 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009075
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009077 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009078 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009079 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080}
9081
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.

   The expansion is wrapped in parentheses so the macro behaves as a single
   primary expression wherever it is used (for example as the operand of
   sizeof, which would otherwise bind to the cast's type name alone).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009086
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089{
9090 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009091 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092 int args_owned = 0;
9093 PyUnicodeObject *result = NULL;
9094 PyObject *dict = NULL;
9095 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009096
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009098 PyErr_BadInternalCall();
9099 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100 }
9101 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009102 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 fmt = PyUnicode_AS_UNICODE(uformat);
9105 fmtcnt = PyUnicode_GET_SIZE(uformat);
9106
9107 reslen = rescnt = fmtcnt + 100;
9108 result = _PyUnicode_New(reslen);
9109 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 res = PyUnicode_AS_UNICODE(result);
9112
9113 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 arglen = PyTuple_Size(args);
9115 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116 }
9117 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 arglen = -1;
9119 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009121 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009122 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124
9125 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 if (*fmt != '%') {
9127 if (--rescnt < 0) {
9128 rescnt = fmtcnt + 100;
9129 reslen += rescnt;
9130 if (_PyUnicode_Resize(&result, reslen) < 0)
9131 goto onError;
9132 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9133 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009136 }
9137 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 /* Got a format specifier */
9139 int flags = 0;
9140 Py_ssize_t width = -1;
9141 int prec = -1;
9142 Py_UNICODE c = '\0';
9143 Py_UNICODE fill;
9144 int isnumok;
9145 PyObject *v = NULL;
9146 PyObject *temp = NULL;
9147 Py_UNICODE *pbuf;
9148 Py_UNICODE sign;
9149 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009150 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 fmt++;
9153 if (*fmt == '(') {
9154 Py_UNICODE *keystart;
9155 Py_ssize_t keylen;
9156 PyObject *key;
9157 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009158
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 if (dict == NULL) {
9160 PyErr_SetString(PyExc_TypeError,
9161 "format requires a mapping");
9162 goto onError;
9163 }
9164 ++fmt;
9165 --fmtcnt;
9166 keystart = fmt;
9167 /* Skip over balanced parentheses */
9168 while (pcount > 0 && --fmtcnt >= 0) {
9169 if (*fmt == ')')
9170 --pcount;
9171 else if (*fmt == '(')
9172 ++pcount;
9173 fmt++;
9174 }
9175 keylen = fmt - keystart - 1;
9176 if (fmtcnt < 0 || pcount > 0) {
9177 PyErr_SetString(PyExc_ValueError,
9178 "incomplete format key");
9179 goto onError;
9180 }
9181#if 0
9182 /* keys are converted to strings using UTF-8 and
9183 then looked up since Python uses strings to hold
9184 variables names etc. in its namespaces and we
9185 wouldn't want to break common idioms. */
9186 key = PyUnicode_EncodeUTF8(keystart,
9187 keylen,
9188 NULL);
9189#else
9190 key = PyUnicode_FromUnicode(keystart, keylen);
9191#endif
9192 if (key == NULL)
9193 goto onError;
9194 if (args_owned) {
9195 Py_DECREF(args);
9196 args_owned = 0;
9197 }
9198 args = PyObject_GetItem(dict, key);
9199 Py_DECREF(key);
9200 if (args == NULL) {
9201 goto onError;
9202 }
9203 args_owned = 1;
9204 arglen = -1;
9205 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009206 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 while (--fmtcnt >= 0) {
9208 switch (c = *fmt++) {
9209 case '-': flags |= F_LJUST; continue;
9210 case '+': flags |= F_SIGN; continue;
9211 case ' ': flags |= F_BLANK; continue;
9212 case '#': flags |= F_ALT; continue;
9213 case '0': flags |= F_ZERO; continue;
9214 }
9215 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009216 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 if (c == '*') {
9218 v = getnextarg(args, arglen, &argidx);
9219 if (v == NULL)
9220 goto onError;
9221 if (!PyLong_Check(v)) {
9222 PyErr_SetString(PyExc_TypeError,
9223 "* wants int");
9224 goto onError;
9225 }
9226 width = PyLong_AsLong(v);
9227 if (width == -1 && PyErr_Occurred())
9228 goto onError;
9229 if (width < 0) {
9230 flags |= F_LJUST;
9231 width = -width;
9232 }
9233 if (--fmtcnt >= 0)
9234 c = *fmt++;
9235 }
9236 else if (c >= '0' && c <= '9') {
9237 width = c - '0';
9238 while (--fmtcnt >= 0) {
9239 c = *fmt++;
9240 if (c < '0' || c > '9')
9241 break;
9242 if ((width*10) / 10 != width) {
9243 PyErr_SetString(PyExc_ValueError,
9244 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009245 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 }
9247 width = width*10 + (c - '0');
9248 }
9249 }
9250 if (c == '.') {
9251 prec = 0;
9252 if (--fmtcnt >= 0)
9253 c = *fmt++;
9254 if (c == '*') {
9255 v = getnextarg(args, arglen, &argidx);
9256 if (v == NULL)
9257 goto onError;
9258 if (!PyLong_Check(v)) {
9259 PyErr_SetString(PyExc_TypeError,
9260 "* wants int");
9261 goto onError;
9262 }
9263 prec = PyLong_AsLong(v);
9264 if (prec == -1 && PyErr_Occurred())
9265 goto onError;
9266 if (prec < 0)
9267 prec = 0;
9268 if (--fmtcnt >= 0)
9269 c = *fmt++;
9270 }
9271 else if (c >= '0' && c <= '9') {
9272 prec = c - '0';
9273 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009274 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009275 if (c < '0' || c > '9')
9276 break;
9277 if ((prec*10) / 10 != prec) {
9278 PyErr_SetString(PyExc_ValueError,
9279 "prec too big");
9280 goto onError;
9281 }
9282 prec = prec*10 + (c - '0');
9283 }
9284 }
9285 } /* prec */
9286 if (fmtcnt >= 0) {
9287 if (c == 'h' || c == 'l' || c == 'L') {
9288 if (--fmtcnt >= 0)
9289 c = *fmt++;
9290 }
9291 }
9292 if (fmtcnt < 0) {
9293 PyErr_SetString(PyExc_ValueError,
9294 "incomplete format");
9295 goto onError;
9296 }
9297 if (c != '%') {
9298 v = getnextarg(args, arglen, &argidx);
9299 if (v == NULL)
9300 goto onError;
9301 }
9302 sign = 0;
9303 fill = ' ';
9304 switch (c) {
9305
9306 case '%':
9307 pbuf = formatbuf;
9308 /* presume that buffer length is at least 1 */
9309 pbuf[0] = '%';
9310 len = 1;
9311 break;
9312
9313 case 's':
9314 case 'r':
9315 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009316 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009317 temp = v;
9318 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009319 }
9320 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009321 if (c == 's')
9322 temp = PyObject_Str(v);
9323 else if (c == 'r')
9324 temp = PyObject_Repr(v);
9325 else
9326 temp = PyObject_ASCII(v);
9327 if (temp == NULL)
9328 goto onError;
9329 if (PyUnicode_Check(temp))
9330 /* nothing to do */;
9331 else {
9332 Py_DECREF(temp);
9333 PyErr_SetString(PyExc_TypeError,
9334 "%s argument has non-string str()");
9335 goto onError;
9336 }
9337 }
9338 pbuf = PyUnicode_AS_UNICODE(temp);
9339 len = PyUnicode_GET_SIZE(temp);
9340 if (prec >= 0 && len > prec)
9341 len = prec;
9342 break;
9343
9344 case 'i':
9345 case 'd':
9346 case 'u':
9347 case 'o':
9348 case 'x':
9349 case 'X':
9350 if (c == 'i')
9351 c = 'd';
9352 isnumok = 0;
9353 if (PyNumber_Check(v)) {
9354 PyObject *iobj=NULL;
9355
9356 if (PyLong_Check(v)) {
9357 iobj = v;
9358 Py_INCREF(iobj);
9359 }
9360 else {
9361 iobj = PyNumber_Long(v);
9362 }
9363 if (iobj!=NULL) {
9364 if (PyLong_Check(iobj)) {
9365 isnumok = 1;
9366 temp = formatlong(iobj, flags, prec, c);
9367 Py_DECREF(iobj);
9368 if (!temp)
9369 goto onError;
9370 pbuf = PyUnicode_AS_UNICODE(temp);
9371 len = PyUnicode_GET_SIZE(temp);
9372 sign = 1;
9373 }
9374 else {
9375 Py_DECREF(iobj);
9376 }
9377 }
9378 }
9379 if (!isnumok) {
9380 PyErr_Format(PyExc_TypeError,
9381 "%%%c format: a number is required, "
9382 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9383 goto onError;
9384 }
9385 if (flags & F_ZERO)
9386 fill = '0';
9387 break;
9388
9389 case 'e':
9390 case 'E':
9391 case 'f':
9392 case 'F':
9393 case 'g':
9394 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009395 temp = formatfloat(v, flags, prec, c);
9396 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009398 pbuf = PyUnicode_AS_UNICODE(temp);
9399 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009400 sign = 1;
9401 if (flags & F_ZERO)
9402 fill = '0';
9403 break;
9404
9405 case 'c':
9406 pbuf = formatbuf;
9407 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9408 if (len < 0)
9409 goto onError;
9410 break;
9411
9412 default:
9413 PyErr_Format(PyExc_ValueError,
9414 "unsupported format character '%c' (0x%x) "
9415 "at index %zd",
9416 (31<=c && c<=126) ? (char)c : '?',
9417 (int)c,
9418 (Py_ssize_t)(fmt - 1 -
9419 PyUnicode_AS_UNICODE(uformat)));
9420 goto onError;
9421 }
9422 if (sign) {
9423 if (*pbuf == '-' || *pbuf == '+') {
9424 sign = *pbuf++;
9425 len--;
9426 }
9427 else if (flags & F_SIGN)
9428 sign = '+';
9429 else if (flags & F_BLANK)
9430 sign = ' ';
9431 else
9432 sign = 0;
9433 }
9434 if (width < len)
9435 width = len;
9436 if (rescnt - (sign != 0) < width) {
9437 reslen -= rescnt;
9438 rescnt = width + fmtcnt + 100;
9439 reslen += rescnt;
9440 if (reslen < 0) {
9441 Py_XDECREF(temp);
9442 PyErr_NoMemory();
9443 goto onError;
9444 }
9445 if (_PyUnicode_Resize(&result, reslen) < 0) {
9446 Py_XDECREF(temp);
9447 goto onError;
9448 }
9449 res = PyUnicode_AS_UNICODE(result)
9450 + reslen - rescnt;
9451 }
9452 if (sign) {
9453 if (fill != ' ')
9454 *res++ = sign;
9455 rescnt--;
9456 if (width > len)
9457 width--;
9458 }
9459 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9460 assert(pbuf[0] == '0');
9461 assert(pbuf[1] == c);
9462 if (fill != ' ') {
9463 *res++ = *pbuf++;
9464 *res++ = *pbuf++;
9465 }
9466 rescnt -= 2;
9467 width -= 2;
9468 if (width < 0)
9469 width = 0;
9470 len -= 2;
9471 }
9472 if (width > len && !(flags & F_LJUST)) {
9473 do {
9474 --rescnt;
9475 *res++ = fill;
9476 } while (--width > len);
9477 }
9478 if (fill == ' ') {
9479 if (sign)
9480 *res++ = sign;
9481 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9482 assert(pbuf[0] == '0');
9483 assert(pbuf[1] == c);
9484 *res++ = *pbuf++;
9485 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009486 }
9487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009488 Py_UNICODE_COPY(res, pbuf, len);
9489 res += len;
9490 rescnt -= len;
9491 while (--width >= len) {
9492 --rescnt;
9493 *res++ = ' ';
9494 }
9495 if (dict && (argidx < arglen) && c != '%') {
9496 PyErr_SetString(PyExc_TypeError,
9497 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009498 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009499 goto onError;
9500 }
9501 Py_XDECREF(temp);
9502 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 } /* until end */
9504 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 PyErr_SetString(PyExc_TypeError,
9506 "not all arguments converted during string formatting");
9507 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 }
9509
Thomas Woutersa96affe2006-03-12 00:29:36 +00009510 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009511 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 }
9515 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 return (PyObject *)result;
9517
Benjamin Peterson29060642009-01-31 22:14:21 +00009518 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 Py_XDECREF(result);
9520 Py_DECREF(uformat);
9521 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523 }
9524 return NULL;
9525}
9526
Jeremy Hylton938ace62002-07-17 16:30:39 +00009527static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009528unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9529
Tim Peters6d6c1a32001-08-02 04:15:00 +00009530static PyObject *
9531unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9532{
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009534 static char *kwlist[] = {"object", "encoding", "errors", 0};
9535 char *encoding = NULL;
9536 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009537
Benjamin Peterson14339b62009-01-31 16:36:08 +00009538 if (type != &PyUnicode_Type)
9539 return unicode_subtype_new(type, args, kwds);
9540 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009542 return NULL;
9543 if (x == NULL)
9544 return (PyObject *)_PyUnicode_New(0);
9545 if (encoding == NULL && errors == NULL)
9546 return PyObject_Str(x);
9547 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009548 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009549}
9550
Guido van Rossume023fe02001-08-30 03:12:59 +00009551static PyObject *
9552unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9553{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009554 PyUnicodeObject *tmp, *pnew;
9555 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009556
Benjamin Peterson14339b62009-01-31 16:36:08 +00009557 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9558 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9559 if (tmp == NULL)
9560 return NULL;
9561 assert(PyUnicode_Check(tmp));
9562 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9563 if (pnew == NULL) {
9564 Py_DECREF(tmp);
9565 return NULL;
9566 }
9567 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9568 if (pnew->str == NULL) {
9569 _Py_ForgetReference((PyObject *)pnew);
9570 PyObject_Del(pnew);
9571 Py_DECREF(tmp);
9572 return PyErr_NoMemory();
9573 }
9574 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9575 pnew->length = n;
9576 pnew->hash = tmp->hash;
9577 Py_DECREF(tmp);
9578 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009579}
9580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009581PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009583\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009584Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009585encoding defaults to the current default string encoding.\n\
9586errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009587
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009588static PyObject *unicode_iter(PyObject *seq);
9589
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009591 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009592 "str", /* tp_name */
9593 sizeof(PyUnicodeObject), /* tp_size */
9594 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009596 (destructor)unicode_dealloc, /* tp_dealloc */
9597 0, /* tp_print */
9598 0, /* tp_getattr */
9599 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009600 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009601 unicode_repr, /* tp_repr */
9602 &unicode_as_number, /* tp_as_number */
9603 &unicode_as_sequence, /* tp_as_sequence */
9604 &unicode_as_mapping, /* tp_as_mapping */
9605 (hashfunc) unicode_hash, /* tp_hash*/
9606 0, /* tp_call*/
9607 (reprfunc) unicode_str, /* tp_str */
9608 PyObject_GenericGetAttr, /* tp_getattro */
9609 0, /* tp_setattro */
9610 0, /* tp_as_buffer */
9611 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009613 unicode_doc, /* tp_doc */
9614 0, /* tp_traverse */
9615 0, /* tp_clear */
9616 PyUnicode_RichCompare, /* tp_richcompare */
9617 0, /* tp_weaklistoffset */
9618 unicode_iter, /* tp_iter */
9619 0, /* tp_iternext */
9620 unicode_methods, /* tp_methods */
9621 0, /* tp_members */
9622 0, /* tp_getset */
9623 &PyBaseObject_Type, /* tp_base */
9624 0, /* tp_dict */
9625 0, /* tp_descr_get */
9626 0, /* tp_descr_set */
9627 0, /* tp_dictoffset */
9628 0, /* tp_init */
9629 0, /* tp_alloc */
9630 unicode_new, /* tp_new */
9631 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632};
9633
9634/* Initialize the Unicode implementation */
9635
Thomas Wouters78890102000-07-22 19:25:51 +00009636void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009638 int i;
9639
Thomas Wouters477c8d52006-05-27 19:21:47 +00009640 /* XXX - move this array to unicodectype.c ? */
9641 Py_UNICODE linebreak[] = {
9642 0x000A, /* LINE FEED */
9643 0x000D, /* CARRIAGE RETURN */
9644 0x001C, /* FILE SEPARATOR */
9645 0x001D, /* GROUP SEPARATOR */
9646 0x001E, /* RECORD SEPARATOR */
9647 0x0085, /* NEXT LINE */
9648 0x2028, /* LINE SEPARATOR */
9649 0x2029, /* PARAGRAPH SEPARATOR */
9650 };
9651
Fred Drakee4315f52000-05-09 19:53:39 +00009652 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009653 free_list = NULL;
9654 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009656 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009657 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009658
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009659 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009660 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009661 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009662 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009663
9664 /* initialize the linebreak bloom filter */
9665 bloom_linebreak = make_bloom_mask(
9666 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9667 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009668
9669 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670}
9671
9672/* Finalize the Unicode implementation */
9673
Christian Heimesa156e092008-02-16 07:38:31 +00009674int
9675PyUnicode_ClearFreeList(void)
9676{
9677 int freelist_size = numfree;
9678 PyUnicodeObject *u;
9679
9680 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009681 PyUnicodeObject *v = u;
9682 u = *(PyUnicodeObject **)u;
9683 if (v->str)
9684 PyObject_DEL(v->str);
9685 Py_XDECREF(v->defenc);
9686 PyObject_Del(v);
9687 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009688 }
9689 free_list = NULL;
9690 assert(numfree == 0);
9691 return freelist_size;
9692}
9693
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694void
Thomas Wouters78890102000-07-22 19:25:51 +00009695_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009697 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009699 Py_XDECREF(unicode_empty);
9700 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009701
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009702 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009703 if (unicode_latin1[i]) {
9704 Py_DECREF(unicode_latin1[i]);
9705 unicode_latin1[i] = NULL;
9706 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009707 }
Christian Heimesa156e092008-02-16 07:38:31 +00009708 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009710
Walter Dörwald16807132007-05-25 13:52:07 +00009711void
9712PyUnicode_InternInPlace(PyObject **p)
9713{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009714 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9715 PyObject *t;
9716 if (s == NULL || !PyUnicode_Check(s))
9717 Py_FatalError(
9718 "PyUnicode_InternInPlace: unicode strings only please!");
9719 /* If it's a subclass, we don't really know what putting
9720 it in the interned dict might do. */
9721 if (!PyUnicode_CheckExact(s))
9722 return;
9723 if (PyUnicode_CHECK_INTERNED(s))
9724 return;
9725 if (interned == NULL) {
9726 interned = PyDict_New();
9727 if (interned == NULL) {
9728 PyErr_Clear(); /* Don't leave an exception */
9729 return;
9730 }
9731 }
9732 /* It might be that the GetItem call fails even
9733 though the key is present in the dictionary,
9734 namely when this happens during a stack overflow. */
9735 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009736 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009737 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009738
Benjamin Peterson29060642009-01-31 22:14:21 +00009739 if (t) {
9740 Py_INCREF(t);
9741 Py_DECREF(*p);
9742 *p = t;
9743 return;
9744 }
Walter Dörwald16807132007-05-25 13:52:07 +00009745
Benjamin Peterson14339b62009-01-31 16:36:08 +00009746 PyThreadState_GET()->recursion_critical = 1;
9747 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9748 PyErr_Clear();
9749 PyThreadState_GET()->recursion_critical = 0;
9750 return;
9751 }
9752 PyThreadState_GET()->recursion_critical = 0;
9753 /* The two references in interned are not counted by refcnt.
9754 The deallocator will take care of this */
9755 Py_REFCNT(s) -= 2;
9756 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009757}
9758
9759void
9760PyUnicode_InternImmortal(PyObject **p)
9761{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009762 PyUnicode_InternInPlace(p);
9763 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9764 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9765 Py_INCREF(*p);
9766 }
Walter Dörwald16807132007-05-25 13:52:07 +00009767}
9768
9769PyObject *
9770PyUnicode_InternFromString(const char *cp)
9771{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009772 PyObject *s = PyUnicode_FromString(cp);
9773 if (s == NULL)
9774 return NULL;
9775 PyUnicode_InternInPlace(&s);
9776 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009777}
9778
9779void _Py_ReleaseInternedUnicodeStrings(void)
9780{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009781 PyObject *keys;
9782 PyUnicodeObject *s;
9783 Py_ssize_t i, n;
9784 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009785
Benjamin Peterson14339b62009-01-31 16:36:08 +00009786 if (interned == NULL || !PyDict_Check(interned))
9787 return;
9788 keys = PyDict_Keys(interned);
9789 if (keys == NULL || !PyList_Check(keys)) {
9790 PyErr_Clear();
9791 return;
9792 }
Walter Dörwald16807132007-05-25 13:52:07 +00009793
Benjamin Peterson14339b62009-01-31 16:36:08 +00009794 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9795 detector, interned unicode strings are not forcibly deallocated;
9796 rather, we give them their stolen references back, and then clear
9797 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009798
Benjamin Peterson14339b62009-01-31 16:36:08 +00009799 n = PyList_GET_SIZE(keys);
9800 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009801 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009802 for (i = 0; i < n; i++) {
9803 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9804 switch (s->state) {
9805 case SSTATE_NOT_INTERNED:
9806 /* XXX Shouldn't happen */
9807 break;
9808 case SSTATE_INTERNED_IMMORTAL:
9809 Py_REFCNT(s) += 1;
9810 immortal_size += s->length;
9811 break;
9812 case SSTATE_INTERNED_MORTAL:
9813 Py_REFCNT(s) += 2;
9814 mortal_size += s->length;
9815 break;
9816 default:
9817 Py_FatalError("Inconsistent interned string state.");
9818 }
9819 s->state = SSTATE_NOT_INTERNED;
9820 }
9821 fprintf(stderr, "total size of all interned strings: "
9822 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9823 "mortal/immortal\n", mortal_size, immortal_size);
9824 Py_DECREF(keys);
9825 PyDict_Clear(interned);
9826 Py_DECREF(interned);
9827 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009828}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009829
9830
9831/********************* Unicode Iterator **************************/
9832
9833typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009834 PyObject_HEAD
9835 Py_ssize_t it_index;
9836 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009837} unicodeiterobject;
9838
9839static void
9840unicodeiter_dealloc(unicodeiterobject *it)
9841{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009842 _PyObject_GC_UNTRACK(it);
9843 Py_XDECREF(it->it_seq);
9844 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009845}
9846
9847static int
9848unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9849{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009850 Py_VISIT(it->it_seq);
9851 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009852}
9853
9854static PyObject *
9855unicodeiter_next(unicodeiterobject *it)
9856{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009857 PyUnicodeObject *seq;
9858 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009859
Benjamin Peterson14339b62009-01-31 16:36:08 +00009860 assert(it != NULL);
9861 seq = it->it_seq;
9862 if (seq == NULL)
9863 return NULL;
9864 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009865
Benjamin Peterson14339b62009-01-31 16:36:08 +00009866 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9867 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009868 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009869 if (item != NULL)
9870 ++it->it_index;
9871 return item;
9872 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009873
Benjamin Peterson14339b62009-01-31 16:36:08 +00009874 Py_DECREF(seq);
9875 it->it_seq = NULL;
9876 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009877}
9878
9879static PyObject *
9880unicodeiter_len(unicodeiterobject *it)
9881{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009882 Py_ssize_t len = 0;
9883 if (it->it_seq)
9884 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9885 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009886}
9887
9888PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9889
9890static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009891 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009892 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009893 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009894};
9895
9896PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009897 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9898 "str_iterator", /* tp_name */
9899 sizeof(unicodeiterobject), /* tp_basicsize */
9900 0, /* tp_itemsize */
9901 /* methods */
9902 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9903 0, /* tp_print */
9904 0, /* tp_getattr */
9905 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009906 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009907 0, /* tp_repr */
9908 0, /* tp_as_number */
9909 0, /* tp_as_sequence */
9910 0, /* tp_as_mapping */
9911 0, /* tp_hash */
9912 0, /* tp_call */
9913 0, /* tp_str */
9914 PyObject_GenericGetAttr, /* tp_getattro */
9915 0, /* tp_setattro */
9916 0, /* tp_as_buffer */
9917 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9918 0, /* tp_doc */
9919 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9920 0, /* tp_clear */
9921 0, /* tp_richcompare */
9922 0, /* tp_weaklistoffset */
9923 PyObject_SelfIter, /* tp_iter */
9924 (iternextfunc)unicodeiter_next, /* tp_iternext */
9925 unicodeiter_methods, /* tp_methods */
9926 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009927};
9928
9929static PyObject *
9930unicode_iter(PyObject *seq)
9931{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009932 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009933
Benjamin Peterson14339b62009-01-31 16:36:08 +00009934 if (!PyUnicode_Check(seq)) {
9935 PyErr_BadInternalCall();
9936 return NULL;
9937 }
9938 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9939 if (it == NULL)
9940 return NULL;
9941 it->it_index = 0;
9942 Py_INCREF(seq);
9943 it->it_seq = (PyUnicodeObject *)seq;
9944 _PyObject_GC_TRACK(it);
9945 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009946}
9947
Martin v. Löwis5b222132007-06-10 09:51:05 +00009948size_t
9949Py_UNICODE_strlen(const Py_UNICODE *u)
9950{
9951 int res = 0;
9952 while(*u++)
9953 res++;
9954 return res;
9955}
9956
9957Py_UNICODE*
9958Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9959{
9960 Py_UNICODE *u = s1;
9961 while ((*u++ = *s2++));
9962 return s1;
9963}
9964
9965Py_UNICODE*
9966Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9967{
9968 Py_UNICODE *u = s1;
9969 while ((*u++ = *s2++))
9970 if (n-- == 0)
9971 break;
9972 return s1;
9973}
9974
9975int
9976Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9977{
9978 while (*s1 && *s2 && *s1 == *s2)
9979 s1++, s2++;
9980 if (*s1 && *s2)
9981 return (*s1 < *s2) ? -1 : +1;
9982 if (*s1)
9983 return 1;
9984 if (*s2)
9985 return -1;
9986 return 0;
9987}
9988
Victor Stinneref8d95c2010-08-16 22:03:11 +00009989int
9990Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9991{
9992 register Py_UNICODE u1, u2;
9993 for (; n != 0; n--) {
9994 u1 = *s1;
9995 u2 = *s2;
9996 if (u1 != u2)
9997 return (u1 < u2) ? -1 : +1;
9998 if (u1 == '\0')
9999 return 0;
10000 s1++;
10001 s2++;
10002 }
10003 return 0;
10004}
10005
Martin v. Löwis5b222132007-06-10 09:51:05 +000010006Py_UNICODE*
10007Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10008{
10009 const Py_UNICODE *p;
10010 for (p = s; *p; p++)
10011 if (*p == c)
10012 return (Py_UNICODE*)p;
10013 return NULL;
10014}
10015
Victor Stinner331ea922010-08-10 16:37:20 +000010016Py_UNICODE*
10017Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10018{
10019 const Py_UNICODE *p;
10020 p = s + Py_UNICODE_strlen(s);
10021 while (p != s) {
10022 p--;
10023 if (*p == c)
10024 return (Py_UNICODE*)p;
10025 }
10026 return NULL;
10027}
10028
Martin v. Löwis5b222132007-06-10 09:51:05 +000010029
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010030#ifdef __cplusplus
10031}
10032#endif