blob: 7c9b8827389f4efbba58e247ec0f4ad48eec6bfb [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   One entry per ASCII code point (0x00-0x7F), indexed by the character
   value; an entry is 1 when the character is treated as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Same for linebreaks.

   One entry per ASCII code point (0x00-0x7F), indexed by the character
   value; an entry is 1 when the character is treated as a line break.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (its value modulo BLOOM_WIDTH) as the bit
   index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
232#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Non-zero when `chr` occurs among the first `setlen` elements of `set`.
   `mask` is expected to be the bloom mask for `set` (e.g. as produced by
   make_bloom_mask); unicode_member() only runs when the mask bit for
   `chr` is set.  The expansion is fully parenthesized so the macro
   composes correctly with surrounding operators such as `!` or `||`.
   Note that `chr` is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000310 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000313
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 return 0;
315}
316
/* We allocate one more Py_UNICODE element to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320
321 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000322 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323
324*/
325
326static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000327PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328{
329 register PyUnicodeObject *unicode;
330
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 if (length == 0 && unicode_empty != NULL) {
333 Py_INCREF(unicode_empty);
334 return unicode_empty;
335 }
336
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000337 /* Ensure we won't overflow the size. */
338 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
339 return (PyUnicodeObject *)PyErr_NoMemory();
340 }
341
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000343 if (free_list) {
344 unicode = free_list;
345 free_list = *(PyUnicodeObject **)unicode;
346 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000347 if (unicode->str) {
348 /* Keep-Alive optimization: we only upsize the buffer,
349 never downsize it. */
350 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000351 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000352 PyObject_DEL(unicode->str);
353 unicode->str = NULL;
354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000355 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000356 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000359 }
360 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000364 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 if (unicode == NULL)
366 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000367 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
368 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
370
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000371 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 PyErr_NoMemory();
373 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000374 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000375 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000376 * the caller fails before initializing str -- unicode_resize()
377 * reads str[0], and the Keep-Alive optimization can keep memory
378 * allocated for str alive across a call to unicode_dealloc(unicode).
379 * We don't want unicode_resize to read uninitialized memory in
380 * that case.
381 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000382 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000384 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000386 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000387 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000389
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000391 /* XXX UNREF/NEWREF interface should be more symmetrical */
392 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000393 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000394 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396}
397
398static
Guido van Rossum9475a232001-10-05 20:51:39 +0000399void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400{
Walter Dörwald16807132007-05-25 13:52:07 +0000401 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000402 case SSTATE_NOT_INTERNED:
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_MORTAL:
406 /* revive dead object temporarily for DelItem */
407 Py_REFCNT(unicode) = 3;
408 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
409 Py_FatalError(
410 "deletion of interned string failed");
411 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000412
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 case SSTATE_INTERNED_IMMORTAL:
414 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000415
Benjamin Peterson29060642009-01-31 22:14:21 +0000416 default:
417 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000418 }
419
Guido van Rossum604ddf82001-12-06 20:03:56 +0000420 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000422 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000423 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
424 PyObject_DEL(unicode->str);
425 unicode->str = NULL;
426 unicode->length = 0;
427 }
428 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000429 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000430 }
431 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000432 *(PyUnicodeObject **)unicode = free_list;
433 free_list = unicode;
434 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 }
436 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000437 PyObject_DEL(unicode->str);
438 Py_XDECREF(unicode->defenc);
439 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 }
441}
442
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000443static
444int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445{
446 register PyUnicodeObject *v;
447
448 /* Argument checks */
449 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 PyErr_BadInternalCall();
451 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000452 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000453 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000454 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 PyErr_BadInternalCall();
456 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000457 }
458
459 /* Resizing unicode_empty and single character objects is not
460 possible since these are being shared. We simply return a fresh
461 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000462 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000463 (v == unicode_empty || v->length == 1)) {
464 PyUnicodeObject *w = _PyUnicode_New(length);
465 if (w == NULL)
466 return -1;
467 Py_UNICODE_COPY(w->str, v->str,
468 length < v->length ? length : v->length);
469 Py_DECREF(*unicode);
470 *unicode = w;
471 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
473
474 /* Note that we don't have to modify *unicode for unshared Unicode
475 objects, since we can modify them in-place. */
476 return unicode_resize(v, length);
477}
478
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000479int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
480{
481 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
482}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000485 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486{
487 PyUnicodeObject *unicode;
488
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000489 /* If the Unicode data is known at construction time, we can apply
490 some optimizations which share commonly used objects. */
491 if (u != NULL) {
492
Benjamin Peterson29060642009-01-31 22:14:21 +0000493 /* Optimization for empty strings */
494 if (size == 0 && unicode_empty != NULL) {
495 Py_INCREF(unicode_empty);
496 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000497 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000498
499 /* Single character Unicode objects in the Latin-1 range are
500 shared when using this constructor */
501 if (size == 1 && *u < 256) {
502 unicode = unicode_latin1[*u];
503 if (!unicode) {
504 unicode = _PyUnicode_New(1);
505 if (!unicode)
506 return NULL;
507 unicode->str[0] = *u;
508 unicode_latin1[*u] = unicode;
509 }
510 Py_INCREF(unicode);
511 return (PyObject *)unicode;
512 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000513 }
Tim Petersced69f82003-09-16 20:30:58 +0000514
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515 unicode = _PyUnicode_New(size);
516 if (!unicode)
517 return NULL;
518
519 /* Copy the Unicode data into the new object */
520 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000521 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522
523 return (PyObject *)unicode;
524}
525
Walter Dörwaldd2034312007-05-18 16:29:38 +0000526PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527{
528 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Benjamin Peterson14339b62009-01-31 16:36:08 +0000530 if (size < 0) {
531 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000532 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000533 return NULL;
534 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000535
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000537 some optimizations which share commonly used objects.
538 Also, this means the input must be UTF-8, so fall back to the
539 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000540 if (u != NULL) {
541
Benjamin Peterson29060642009-01-31 22:14:21 +0000542 /* Optimization for empty strings */
543 if (size == 0 && unicode_empty != NULL) {
544 Py_INCREF(unicode_empty);
545 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000546 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000547
548 /* Single characters are shared when using this constructor.
549 Restrict to ASCII, since the input must be UTF-8. */
550 if (size == 1 && Py_CHARMASK(*u) < 128) {
551 unicode = unicode_latin1[Py_CHARMASK(*u)];
552 if (!unicode) {
553 unicode = _PyUnicode_New(1);
554 if (!unicode)
555 return NULL;
556 unicode->str[0] = Py_CHARMASK(*u);
557 unicode_latin1[Py_CHARMASK(*u)] = unicode;
558 }
559 Py_INCREF(unicode);
560 return (PyObject *)unicode;
561 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000562
563 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 }
565
Walter Dörwald55507312007-05-18 13:12:10 +0000566 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000567 if (!unicode)
568 return NULL;
569
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000570 return (PyObject *)unicode;
571}
572
Walter Dörwaldd2034312007-05-18 16:29:38 +0000573PyObject *PyUnicode_FromString(const char *u)
574{
575 size_t size = strlen(u);
576 if (size > PY_SSIZE_T_MAX) {
577 PyErr_SetString(PyExc_OverflowError, "input too long");
578 return NULL;
579 }
580
581 return PyUnicode_FromStringAndSize(u, size);
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584#ifdef HAVE_WCHAR_H
585
Mark Dickinson081dfee2009-03-18 14:47:41 +0000586#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
587# define CONVERT_WCHAR_TO_SURROGATES
588#endif
589
590#ifdef CONVERT_WCHAR_TO_SURROGATES
591
592/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
593 to convert from UTF32 to UTF16. */
594
595PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
596 Py_ssize_t size)
597{
598 PyUnicodeObject *unicode;
599 register Py_ssize_t i;
600 Py_ssize_t alloc;
601 const wchar_t *orig_w;
602
603 if (w == NULL) {
604 if (size == 0)
605 return PyUnicode_FromStringAndSize(NULL, 0);
606 PyErr_BadInternalCall();
607 return NULL;
608 }
609
610 if (size == -1) {
611 size = wcslen(w);
612 }
613
614 alloc = size;
615 orig_w = w;
616 for (i = size; i > 0; i--) {
617 if (*w > 0xFFFF)
618 alloc++;
619 w++;
620 }
621 w = orig_w;
622 unicode = _PyUnicode_New(alloc);
623 if (!unicode)
624 return NULL;
625
626 /* Copy the wchar_t data into the new object */
627 {
628 register Py_UNICODE *u;
629 u = PyUnicode_AS_UNICODE(unicode);
630 for (i = size; i > 0; i--) {
631 if (*w > 0xFFFF) {
632 wchar_t ordinal = *w++;
633 ordinal -= 0x10000;
634 *u++ = 0xD800 | (ordinal >> 10);
635 *u++ = 0xDC00 | (ordinal & 0x3FF);
636 }
637 else
638 *u++ = *w++;
639 }
640 }
641 return (PyObject *)unicode;
642}
643
644#else
645
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000647 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648{
649 PyUnicodeObject *unicode;
650
651 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000652 if (size == 0)
653 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 PyErr_BadInternalCall();
655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 }
657
Martin v. Löwis790465f2008-04-05 20:41:37 +0000658 if (size == -1) {
659 size = wcslen(w);
660 }
661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 unicode = _PyUnicode_New(size);
663 if (!unicode)
664 return NULL;
665
666 /* Copy the wchar_t data into the new object */
667#ifdef HAVE_USABLE_WCHAR_T
668 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000669#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000671 register Py_UNICODE *u;
672 register Py_ssize_t i;
673 u = PyUnicode_AS_UNICODE(unicode);
674 for (i = size; i > 0; i--)
675 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000676 }
677#endif
678
679 return (PyObject *)unicode;
680}
681
Mark Dickinson081dfee2009-03-18 14:47:41 +0000682#endif /* CONVERT_WCHAR_TO_SURROGATES */
683
684#undef CONVERT_WCHAR_TO_SURROGATES
685
Walter Dörwald346737f2007-05-31 10:44:43 +0000686static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000687makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
688 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000689{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000690 *fmt++ = '%';
691 if (width) {
692 if (zeropad)
693 *fmt++ = '0';
694 fmt += sprintf(fmt, "%d", width);
695 }
696 if (precision)
697 fmt += sprintf(fmt, ".%d", precision);
698 if (longflag)
699 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000700 else if (longlongflag) {
701 /* longlongflag should only ever be nonzero on machines with
702 HAVE_LONG_LONG defined */
703#ifdef HAVE_LONG_LONG
704 char *f = PY_FORMAT_LONG_LONG;
705 while (*f)
706 *fmt++ = *f++;
707#else
708 /* we shouldn't ever get here */
709 assert(0);
710 *fmt++ = 'l';
711#endif
712 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000713 else if (size_tflag) {
714 char *f = PY_FORMAT_SIZE_T;
715 while (*f)
716 *fmt++ = *f++;
717 }
718 *fmt++ = c;
719 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000720}
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
723
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000724/* size of fixed-size buffer for formatting single arguments */
725#define ITEM_BUFFER_LEN 21
726/* maximum number of characters required for output of %ld. 21 characters
727 allows for 64-bit integers (in decimal) and an optional sign. */
728#define MAX_LONG_CHARS 21
729/* maximum number of characters required for output of %lld.
730 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
731 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
732#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
733
Walter Dörwaldd2034312007-05-18 16:29:38 +0000734PyObject *
735PyUnicode_FromFormatV(const char *format, va_list vargs)
736{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000737 va_list count;
738 Py_ssize_t callcount = 0;
739 PyObject **callresults = NULL;
740 PyObject **callresult = NULL;
741 Py_ssize_t n = 0;
742 int width = 0;
743 int precision = 0;
744 int zeropad;
745 const char* f;
746 Py_UNICODE *s;
747 PyObject *string;
748 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000749 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000750 /* use abuffer instead of buffer, if we need more space
751 * (which can happen if there's a format specifier with width). */
752 char *abuffer = NULL;
753 char *realbuffer;
754 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000755 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000756 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000758 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000759 /* step 1: count the number of %S/%R/%A/%s format specifications
760 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
761 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
762 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000763 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000764 if (*f == '%') {
765 if (*(f+1)=='%')
766 continue;
767 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
768 ++callcount;
769 while (ISDIGIT((unsigned)*f))
770 width = (width*10) + *f++ - '0';
771 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
772 ;
773 if (*f == 's')
774 ++callcount;
775 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000776 }
777 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000778 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000779 if (callcount) {
780 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
781 if (!callresults) {
782 PyErr_NoMemory();
783 return NULL;
784 }
785 callresult = callresults;
786 }
787 /* step 3: figure out how large a buffer we need */
788 for (f = format; *f; f++) {
789 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000790#ifdef HAVE_LONG_LONG
791 int longlongflag = 0;
792#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 const char* p = f;
794 width = 0;
795 while (ISDIGIT((unsigned)*f))
796 width = (width*10) + *f++ - '0';
797 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
798 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
801 * they don't affect the amount of space we reserve.
802 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000803 if (*f == 'l') {
804 if (f[1] == 'd' || f[1] == 'u') {
805 ++f;
806 }
807#ifdef HAVE_LONG_LONG
808 else if (f[1] == 'l' &&
809 (f[2] == 'd' || f[2] == 'u')) {
810 longlongflag = 1;
811 f += 2;
812 }
813#endif
814 }
815 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000816 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000817 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818
Benjamin Peterson14339b62009-01-31 16:36:08 +0000819 switch (*f) {
820 case 'c':
821 (void)va_arg(count, int);
822 /* fall through... */
823 case '%':
824 n++;
825 break;
826 case 'd': case 'u': case 'i': case 'x':
827 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000828#ifdef HAVE_LONG_LONG
829 if (longlongflag) {
830 if (width < MAX_LONG_LONG_CHARS)
831 width = MAX_LONG_LONG_CHARS;
832 }
833 else
834#endif
835 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
836 including sign. Decimal takes the most space. This
837 isn't enough for octal. If a width is specified we
838 need more (which we allocate later). */
839 if (width < MAX_LONG_CHARS)
840 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000842 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000843 if (abuffersize < width)
844 abuffersize = width;
845 break;
846 case 's':
847 {
848 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000849 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000850 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
851 if (!str)
852 goto fail;
853 n += PyUnicode_GET_SIZE(str);
854 /* Remember the str and switch to the next slot */
855 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000856 break;
857 }
858 case 'U':
859 {
860 PyObject *obj = va_arg(count, PyObject *);
861 assert(obj && PyUnicode_Check(obj));
862 n += PyUnicode_GET_SIZE(obj);
863 break;
864 }
865 case 'V':
866 {
867 PyObject *obj = va_arg(count, PyObject *);
868 const char *str = va_arg(count, const char *);
869 assert(obj || str);
870 assert(!obj || PyUnicode_Check(obj));
871 if (obj)
872 n += PyUnicode_GET_SIZE(obj);
873 else
874 n += strlen(str);
875 break;
876 }
877 case 'S':
878 {
879 PyObject *obj = va_arg(count, PyObject *);
880 PyObject *str;
881 assert(obj);
882 str = PyObject_Str(obj);
883 if (!str)
884 goto fail;
885 n += PyUnicode_GET_SIZE(str);
886 /* Remember the str and switch to the next slot */
887 *callresult++ = str;
888 break;
889 }
890 case 'R':
891 {
892 PyObject *obj = va_arg(count, PyObject *);
893 PyObject *repr;
894 assert(obj);
895 repr = PyObject_Repr(obj);
896 if (!repr)
897 goto fail;
898 n += PyUnicode_GET_SIZE(repr);
899 /* Remember the repr and switch to the next slot */
900 *callresult++ = repr;
901 break;
902 }
903 case 'A':
904 {
905 PyObject *obj = va_arg(count, PyObject *);
906 PyObject *ascii;
907 assert(obj);
908 ascii = PyObject_ASCII(obj);
909 if (!ascii)
910 goto fail;
911 n += PyUnicode_GET_SIZE(ascii);
912 /* Remember the repr and switch to the next slot */
913 *callresult++ = ascii;
914 break;
915 }
916 case 'p':
917 (void) va_arg(count, int);
918 /* maximum 64-bit pointer representation:
919 * 0xffffffffffffffff
920 * so 19 characters is enough.
921 * XXX I count 18 -- what's the extra for?
922 */
923 n += 19;
924 break;
925 default:
926 /* if we stumble upon an unknown
927 formatting code, copy the rest of
928 the format string to the output
929 string. (we cannot just skip the
930 code, since there's no way to know
931 what's in the argument list) */
932 n += strlen(p);
933 goto expand;
934 }
935 } else
936 n++;
937 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000938 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000939 if (abuffersize > ITEM_BUFFER_LEN) {
940 /* add 1 for sprintf's trailing null byte */
941 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000942 if (!abuffer) {
943 PyErr_NoMemory();
944 goto fail;
945 }
946 realbuffer = abuffer;
947 }
948 else
949 realbuffer = buffer;
950 /* step 4: fill the buffer */
951 /* Since we've analyzed how much space we need for the worst case,
952 we don't have to resize the string.
953 There can be no errors beyond this point. */
954 string = PyUnicode_FromUnicode(NULL, n);
955 if (!string)
956 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000957
Benjamin Peterson14339b62009-01-31 16:36:08 +0000958 s = PyUnicode_AS_UNICODE(string);
959 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000960
Benjamin Peterson14339b62009-01-31 16:36:08 +0000961 for (f = format; *f; f++) {
962 if (*f == '%') {
963 const char* p = f++;
964 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000965 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000966 int size_tflag = 0;
967 zeropad = (*f == '0');
968 /* parse the width.precision part */
969 width = 0;
970 while (ISDIGIT((unsigned)*f))
971 width = (width*10) + *f++ - '0';
972 precision = 0;
973 if (*f == '.') {
974 f++;
975 while (ISDIGIT((unsigned)*f))
976 precision = (precision*10) + *f++ - '0';
977 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000978 /* Handle %ld, %lu, %lld and %llu. */
979 if (*f == 'l') {
980 if (f[1] == 'd' || f[1] == 'u') {
981 longflag = 1;
982 ++f;
983 }
984#ifdef HAVE_LONG_LONG
985 else if (f[1] == 'l' &&
986 (f[2] == 'd' || f[2] == 'u')) {
987 longlongflag = 1;
988 f += 2;
989 }
990#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000991 }
992 /* handle the size_t flag. */
993 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
994 size_tflag = 1;
995 ++f;
996 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000997
Benjamin Peterson14339b62009-01-31 16:36:08 +0000998 switch (*f) {
999 case 'c':
1000 *s++ = va_arg(vargs, int);
1001 break;
1002 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001003 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1004 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001005 if (longflag)
1006 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001007#ifdef HAVE_LONG_LONG
1008 else if (longlongflag)
1009 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1010#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001011 else if (size_tflag)
1012 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1013 else
1014 sprintf(realbuffer, fmt, va_arg(vargs, int));
1015 appendstring(realbuffer);
1016 break;
1017 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001018 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1019 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001020 if (longflag)
1021 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001022#ifdef HAVE_LONG_LONG
1023 else if (longlongflag)
1024 sprintf(realbuffer, fmt, va_arg(vargs,
1025 unsigned PY_LONG_LONG));
1026#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001027 else if (size_tflag)
1028 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1029 else
1030 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1031 appendstring(realbuffer);
1032 break;
1033 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 sprintf(realbuffer, fmt, va_arg(vargs, int));
1036 appendstring(realbuffer);
1037 break;
1038 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001039 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001040 sprintf(realbuffer, fmt, va_arg(vargs, int));
1041 appendstring(realbuffer);
1042 break;
1043 case 's':
1044 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001045 /* unused, since we already have the result */
1046 (void) va_arg(vargs, char *);
1047 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1048 PyUnicode_GET_SIZE(*callresult));
1049 s += PyUnicode_GET_SIZE(*callresult);
1050 /* We're done with the unicode()/repr() => forget it */
1051 Py_DECREF(*callresult);
1052 /* switch to next unicode()/repr() result */
1053 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001054 break;
1055 }
1056 case 'U':
1057 {
1058 PyObject *obj = va_arg(vargs, PyObject *);
1059 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1060 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1061 s += size;
1062 break;
1063 }
1064 case 'V':
1065 {
1066 PyObject *obj = va_arg(vargs, PyObject *);
1067 const char *str = va_arg(vargs, const char *);
1068 if (obj) {
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 } else {
1073 appendstring(str);
1074 }
1075 break;
1076 }
1077 case 'S':
1078 case 'R':
1079 {
1080 Py_UNICODE *ucopy;
1081 Py_ssize_t usize;
1082 Py_ssize_t upos;
1083 /* unused, since we already have the result */
1084 (void) va_arg(vargs, PyObject *);
1085 ucopy = PyUnicode_AS_UNICODE(*callresult);
1086 usize = PyUnicode_GET_SIZE(*callresult);
1087 for (upos = 0; upos<usize;)
1088 *s++ = ucopy[upos++];
1089 /* We're done with the unicode()/repr() => forget it */
1090 Py_DECREF(*callresult);
1091 /* switch to next unicode()/repr() result */
1092 ++callresult;
1093 break;
1094 }
1095 case 'p':
1096 sprintf(buffer, "%p", va_arg(vargs, void*));
1097 /* %p is ill-defined: ensure leading 0x. */
1098 if (buffer[1] == 'X')
1099 buffer[1] = 'x';
1100 else if (buffer[1] != 'x') {
1101 memmove(buffer+2, buffer, strlen(buffer)+1);
1102 buffer[0] = '0';
1103 buffer[1] = 'x';
1104 }
1105 appendstring(buffer);
1106 break;
1107 case '%':
1108 *s++ = '%';
1109 break;
1110 default:
1111 appendstring(p);
1112 goto end;
1113 }
1114 } else
1115 *s++ = *f;
1116 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001117
Benjamin Peterson29060642009-01-31 22:14:21 +00001118 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001119 if (callresults)
1120 PyObject_Free(callresults);
1121 if (abuffer)
1122 PyObject_Free(abuffer);
1123 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1124 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001125 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001126 if (callresults) {
1127 PyObject **callresult2 = callresults;
1128 while (callresult2 < callresult) {
1129 Py_DECREF(*callresult2);
1130 ++callresult2;
1131 }
1132 PyObject_Free(callresults);
1133 }
1134 if (abuffer)
1135 PyObject_Free(abuffer);
1136 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001137}
1138
1139#undef appendstring
1140
1141PyObject *
1142PyUnicode_FromFormat(const char *format, ...)
1143{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 PyObject* ret;
1145 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001146
1147#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001148 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001149#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001150 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001151#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 ret = PyUnicode_FromFormatV(format, vargs);
1153 va_end(vargs);
1154 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
Martin v. Löwis18e16552006-02-15 17:27:45 +00001157Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 wchar_t *w,
1159 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160{
1161 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001162 PyErr_BadInternalCall();
1163 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001165
1166 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001169
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170#ifdef HAVE_USABLE_WCHAR_T
1171 memcpy(w, unicode->str, size * sizeof(wchar_t));
1172#else
1173 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001174 register Py_UNICODE *u;
1175 register Py_ssize_t i;
1176 u = PyUnicode_AS_UNICODE(unicode);
1177 for (i = size; i > 0; i--)
1178 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 }
1180#endif
1181
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001182 if (size > PyUnicode_GET_SIZE(unicode))
1183 return PyUnicode_GET_SIZE(unicode);
1184 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001185 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186}
1187
1188#endif
1189
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001190PyObject *PyUnicode_FromOrdinal(int ordinal)
1191{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001192 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001193
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001194 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 PyErr_SetString(PyExc_ValueError,
1196 "chr() arg not in range(0x110000)");
1197 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001198 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001199
1200#ifndef Py_UNICODE_WIDE
1201 if (ordinal > 0xffff) {
1202 ordinal -= 0x10000;
1203 s[0] = 0xD800 | (ordinal >> 10);
1204 s[1] = 0xDC00 | (ordinal & 0x3FF);
1205 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001206 }
1207#endif
1208
Hye-Shik Chang40574832004-04-06 07:24:51 +00001209 s[0] = (Py_UNICODE)ordinal;
1210 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001211}
1212
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213PyObject *PyUnicode_FromObject(register PyObject *obj)
1214{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001215 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001217 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001218 Py_INCREF(obj);
1219 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001220 }
1221 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001222 /* For a Unicode subtype that's not a Unicode object,
1223 return a true Unicode object with the same data. */
1224 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1225 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001226 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001227 PyErr_Format(PyExc_TypeError,
1228 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001229 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001230 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001231}
1232
1233PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 const char *encoding,
1235 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001236{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001237 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001238 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001239 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001240
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001242 PyErr_BadInternalCall();
1243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001245
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001246 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001247 PyErr_SetString(PyExc_TypeError,
1248 "decoding str is not supported");
1249 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001250 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001251
1252 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001253 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001254 s = PyBytes_AS_STRING(obj);
1255 len = PyBytes_GET_SIZE(obj);
1256 }
1257 else if (PyByteArray_Check(obj)) {
1258 s = PyByteArray_AS_STRING(obj);
1259 len = PyByteArray_GET_SIZE(obj);
1260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001262 /* Overwrite the error message with something more useful in
1263 case of a TypeError. */
1264 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001265 PyErr_Format(PyExc_TypeError,
Georg Brandl952867a2010-06-27 10:17:12 +00001266 "coercing to str: need bytes, bytearray or char buffer, "
Benjamin Peterson29060642009-01-31 22:14:21 +00001267 "%.80s found",
1268 Py_TYPE(obj)->tp_name);
1269 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001270 }
Tim Petersced69f82003-09-16 20:30:58 +00001271
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001272 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001274 Py_INCREF(unicode_empty);
1275 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 }
Tim Petersced69f82003-09-16 20:30:58 +00001277 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001278 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001279
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001280 return v;
1281
Benjamin Peterson29060642009-01-31 22:14:21 +00001282 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284}
1285
/* Write a normalized copy of encoding into lower: upper-case letters are
   converted to lower case and '_' becomes '-', so that e.g. UTF_8 is
   recognized.  lower_len is the capacity of lower including the
   terminating NUL.

   Return 1 on success.  Return 0 if encoding is longer than lower_len-1
   characters; lower is not NUL-terminated in that case. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot, reserved for the terminator */
    char *limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (ISUPPER(*src))
            ch = TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;

        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1318
1319PyObject *PyUnicode_Decode(const char *s,
1320 Py_ssize_t size,
1321 const char *encoding,
1322 const char *errors)
1323{
1324 PyObject *buffer = NULL, *unicode;
1325 Py_buffer info;
1326 char lower[11]; /* Enough for any encoding shortcut */
1327
1328 if (encoding == NULL)
1329 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001330
1331 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001332 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1333 if (strcmp(lower, "utf-8") == 0)
1334 return PyUnicode_DecodeUTF8(s, size, errors);
1335 else if ((strcmp(lower, "latin-1") == 0) ||
1336 (strcmp(lower, "iso-8859-1") == 0))
1337 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001338#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001339 else if (strcmp(lower, "mbcs") == 0)
1340 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001341#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001342 else if (strcmp(lower, "ascii") == 0)
1343 return PyUnicode_DecodeASCII(s, size, errors);
1344 else if (strcmp(lower, "utf-16") == 0)
1345 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1346 else if (strcmp(lower, "utf-32") == 0)
1347 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349
1350 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001351 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001352 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001353 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001354 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 if (buffer == NULL)
1356 goto onError;
1357 unicode = PyCodec_Decode(buffer, encoding, errors);
1358 if (unicode == NULL)
1359 goto onError;
1360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001362 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001363 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364 Py_DECREF(unicode);
1365 goto onError;
1366 }
1367 Py_DECREF(buffer);
1368 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001369
Benjamin Peterson29060642009-01-31 22:14:21 +00001370 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371 Py_XDECREF(buffer);
1372 return NULL;
1373}
1374
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1376 const char *encoding,
1377 const char *errors)
1378{
1379 PyObject *v;
1380
1381 if (!PyUnicode_Check(unicode)) {
1382 PyErr_BadArgument();
1383 goto onError;
1384 }
1385
1386 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001387 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001388
1389 /* Decode via the codec registry */
1390 v = PyCodec_Decode(unicode, encoding, errors);
1391 if (v == NULL)
1392 goto onError;
1393 return v;
1394
Benjamin Peterson29060642009-01-31 22:14:21 +00001395 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001396 return NULL;
1397}
1398
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001399PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1400 const char *encoding,
1401 const char *errors)
1402{
1403 PyObject *v;
1404
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409
1410 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001411 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001412
1413 /* Decode via the codec registry */
1414 v = PyCodec_Decode(unicode, encoding, errors);
1415 if (v == NULL)
1416 goto onError;
1417 if (!PyUnicode_Check(v)) {
1418 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001419 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001420 Py_TYPE(v)->tp_name);
1421 Py_DECREF(v);
1422 goto onError;
1423 }
1424 return v;
1425
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001427 return NULL;
1428}
1429
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001431 Py_ssize_t size,
1432 const char *encoding,
1433 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434{
1435 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001436
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437 unicode = PyUnicode_FromUnicode(s, size);
1438 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1441 Py_DECREF(unicode);
1442 return v;
1443}
1444
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001445PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1446 const char *encoding,
1447 const char *errors)
1448{
1449 PyObject *v;
1450
1451 if (!PyUnicode_Check(unicode)) {
1452 PyErr_BadArgument();
1453 goto onError;
1454 }
1455
1456 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001457 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001458
1459 /* Encode via the codec registry */
1460 v = PyCodec_Encode(unicode, encoding, errors);
1461 if (v == NULL)
1462 goto onError;
1463 return v;
1464
Benjamin Peterson29060642009-01-31 22:14:21 +00001465 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001466 return NULL;
1467}
1468
Victor Stinnerae6265f2010-05-15 16:27:27 +00001469PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1470{
Victor Stinner313a1202010-06-11 23:56:51 +00001471 if (Py_FileSystemDefaultEncoding) {
1472#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1473 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1474 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1475 PyUnicode_GET_SIZE(unicode),
1476 NULL);
1477#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001478 return PyUnicode_AsEncodedString(unicode,
1479 Py_FileSystemDefaultEncoding,
1480 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001481 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001482 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1483 PyUnicode_GET_SIZE(unicode),
1484 "surrogateescape");
1485}
1486
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1488 const char *encoding,
1489 const char *errors)
1490{
1491 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001492 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001493
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 if (!PyUnicode_Check(unicode)) {
1495 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497 }
Fred Drakee4315f52000-05-09 19:53:39 +00001498
Tim Petersced69f82003-09-16 20:30:58 +00001499 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001500 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001501
1502 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001503 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1504 if (strcmp(lower, "utf-8") == 0)
1505 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1506 PyUnicode_GET_SIZE(unicode),
1507 errors);
1508 else if ((strcmp(lower, "latin-1") == 0) ||
1509 (strcmp(lower, "iso-8859-1") == 0))
1510 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1511 PyUnicode_GET_SIZE(unicode),
1512 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001513#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001514 else if (strcmp(lower, "mbcs") == 0)
1515 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001518#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001519 else if (strcmp(lower, "ascii") == 0)
1520 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1521 PyUnicode_GET_SIZE(unicode),
1522 errors);
1523 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001524 /* During bootstrap, we may need to find the encodings
1525 package, to load the file system encoding, and require the
1526 file system encoding in order to load the encodings
1527 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001528
Victor Stinner59e62db2010-05-15 13:14:32 +00001529 Break out of this dependency by assuming that the path to
1530 the encodings module is ASCII-only. XXX could try wcstombs
1531 instead, if the file system encoding is the locale's
1532 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001533 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001534 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1535 !PyThreadState_GET()->interp->codecs_initialized)
1536 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1537 PyUnicode_GET_SIZE(unicode),
1538 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539
1540 /* Encode via the codec registry */
1541 v = PyCodec_Encode(unicode, encoding, errors);
1542 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001543 return NULL;
1544
1545 /* The normal path */
1546 if (PyBytes_Check(v))
1547 return v;
1548
1549 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001550 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001551 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001552 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001553
1554 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1555 "encoder %s returned bytearray instead of bytes",
1556 encoding);
1557 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001558 Py_DECREF(v);
1559 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001561
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001562 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1563 Py_DECREF(v);
1564 return b;
1565 }
1566
1567 PyErr_Format(PyExc_TypeError,
1568 "encoder did not return a bytes object (type=%.400s)",
1569 Py_TYPE(v)->tp_name);
1570 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001571 return NULL;
1572}
1573
1574PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1575 const char *encoding,
1576 const char *errors)
1577{
1578 PyObject *v;
1579
1580 if (!PyUnicode_Check(unicode)) {
1581 PyErr_BadArgument();
1582 goto onError;
1583 }
1584
1585 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001587
1588 /* Encode via the codec registry */
1589 v = PyCodec_Encode(unicode, encoding, errors);
1590 if (v == NULL)
1591 goto onError;
1592 if (!PyUnicode_Check(v)) {
1593 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001594 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001595 Py_TYPE(v)->tp_name);
1596 Py_DECREF(v);
1597 goto onError;
1598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001600
Benjamin Peterson29060642009-01-31 22:14:21 +00001601 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602 return NULL;
1603}
1604
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001605PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001606 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001607{
1608 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001609 if (v)
1610 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001611 if (errors != NULL)
1612 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001613 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001614 PyUnicode_GET_SIZE(unicode),
1615 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001616 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001617 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001618 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001619 return v;
1620}
1621
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001622PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001623PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001624 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001625 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1626}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001627
Christian Heimes5894ba72007-11-04 11:43:14 +00001628PyObject*
1629PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1630{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001631 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1632 can be undefined. If it is case, decode using UTF-8. The following assumes
1633 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1634 bootstrapping process where the codecs aren't ready yet.
1635 */
1636 if (Py_FileSystemDefaultEncoding) {
1637#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001638 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001639 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001640 }
1641#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001642 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001643 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001644 }
1645#endif
1646 return PyUnicode_Decode(s, size,
1647 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001648 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001649 }
1650 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001651 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001652 }
1653}
1654
Martin v. Löwis011e8422009-05-05 04:43:17 +00001655/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001656 system encoding. The addr param must be a PyObject**.
1657 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001658
1659int
1660PyUnicode_FSConverter(PyObject* arg, void* addr)
1661{
1662 PyObject *output = NULL;
1663 Py_ssize_t size;
1664 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001665 if (arg == NULL) {
1666 Py_DECREF(*(PyObject**)addr);
1667 return 1;
1668 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001669 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001670 output = arg;
1671 Py_INCREF(output);
1672 }
1673 else {
1674 arg = PyUnicode_FromObject(arg);
1675 if (!arg)
1676 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001677 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001678 Py_DECREF(arg);
1679 if (!output)
1680 return 0;
1681 if (!PyBytes_Check(output)) {
1682 Py_DECREF(output);
1683 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1684 return 0;
1685 }
1686 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001687 size = PyBytes_GET_SIZE(output);
1688 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001689 if (size != strlen(data)) {
1690 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1691 Py_DECREF(output);
1692 return 0;
1693 }
1694 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001695 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001696}
1697
1698
Martin v. Löwis5b222132007-06-10 09:51:05 +00001699char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001700_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001701{
Christian Heimesf3863112007-11-22 07:46:41 +00001702 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001703 if (!PyUnicode_Check(unicode)) {
1704 PyErr_BadArgument();
1705 return NULL;
1706 }
Christian Heimesf3863112007-11-22 07:46:41 +00001707 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1708 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001709 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001710 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001711 *psize = PyBytes_GET_SIZE(bytes);
1712 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001713}
1714
1715char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001716_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001717{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001718 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001719}
1720
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1722{
1723 if (!PyUnicode_Check(unicode)) {
1724 PyErr_BadArgument();
1725 goto onError;
1726 }
1727 return PyUnicode_AS_UNICODE(unicode);
1728
Benjamin Peterson29060642009-01-31 22:14:21 +00001729 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 return NULL;
1731}
1732
Martin v. Löwis18e16552006-02-15 17:27:45 +00001733Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734{
1735 if (!PyUnicode_Check(unicode)) {
1736 PyErr_BadArgument();
1737 goto onError;
1738 }
1739 return PyUnicode_GET_SIZE(unicode);
1740
Benjamin Peterson29060642009-01-31 22:14:21 +00001741 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 return -1;
1743}
1744
Thomas Wouters78890102000-07-22 19:25:51 +00001745const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001746{
1747 return unicode_default_encoding;
1748}
1749
1750int PyUnicode_SetDefaultEncoding(const char *encoding)
1751{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001752 if (strcmp(encoding, unicode_default_encoding) != 0) {
1753 PyErr_Format(PyExc_ValueError,
1754 "Can only set default encoding to %s",
1755 unicode_default_encoding);
1756 return -1;
1757 }
Fred Drakee4315f52000-05-09 19:53:39 +00001758 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001759}
1760
Victor Stinner554f3f02010-06-16 23:33:54 +00001761/* create or adjust a UnicodeDecodeError */
1762static void
1763make_decode_exception(PyObject **exceptionObject,
1764 const char *encoding,
1765 const char *input, Py_ssize_t length,
1766 Py_ssize_t startpos, Py_ssize_t endpos,
1767 const char *reason)
1768{
1769 if (*exceptionObject == NULL) {
1770 *exceptionObject = PyUnicodeDecodeError_Create(
1771 encoding, input, length, startpos, endpos, reason);
1772 }
1773 else {
1774 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1775 goto onError;
1776 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1777 goto onError;
1778 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1779 goto onError;
1780 }
1781 return;
1782
1783onError:
1784 Py_DECREF(*exceptionObject);
1785 *exceptionObject = NULL;
1786}
1787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001788/* error handling callback helper:
1789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001791 and adjust various state variables.
1792 return 0 on success, -1 on error
1793*/
1794
1795static
1796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001797 const char *encoding, const char *reason,
1798 const char **input, const char **inend, Py_ssize_t *startinpos,
1799 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1800 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001802 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803
1804 PyObject *restuple = NULL;
1805 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001806 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001807 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001808 Py_ssize_t requiredsize;
1809 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001811 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001812 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 int res = -1;
1814
1815 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001816 *errorHandler = PyCodec_LookupError(errors);
1817 if (*errorHandler == NULL)
1818 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001819 }
1820
Victor Stinner554f3f02010-06-16 23:33:54 +00001821 make_decode_exception(exceptionObject,
1822 encoding,
1823 *input, *inend - *input,
1824 *startinpos, *endinpos,
1825 reason);
1826 if (*exceptionObject == NULL)
1827 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828
1829 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1830 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001831 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001833 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001834 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 }
1836 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001837 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001838
1839 /* Copy back the bytes variables, which might have been modified by the
1840 callback */
1841 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1842 if (!inputobj)
1843 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001844 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001845 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001846 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001847 *input = PyBytes_AS_STRING(inputobj);
1848 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001849 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001850 /* we can DECREF safely, as the exception has another reference,
1851 so the object won't go away. */
1852 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001853
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001855 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001856 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001857 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1858 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001859 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001860
1861 /* need more space? (at least enough for what we
1862 have+the replacement+the rest of the string (starting
1863 at the new input position), so we won't have to check space
1864 when there are no errors in the rest of the string) */
1865 repptr = PyUnicode_AS_UNICODE(repunicode);
1866 repsize = PyUnicode_GET_SIZE(repunicode);
1867 requiredsize = *outpos + repsize + insize-newpos;
1868 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001869 if (requiredsize<2*outsize)
1870 requiredsize = 2*outsize;
1871 if (_PyUnicode_Resize(output, requiredsize) < 0)
1872 goto onError;
1873 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001874 }
1875 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001876 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 Py_UNICODE_COPY(*outptr, repptr, repsize);
1878 *outptr += repsize;
1879 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001880
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 /* we made it! */
1882 res = 0;
1883
Benjamin Peterson29060642009-01-31 22:14:21 +00001884 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001885 Py_XDECREF(restuple);
1886 return res;
1887}
1888
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001969
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001970PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001971 Py_ssize_t size,
1972 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001973{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001974 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1975}
1976
Antoine Pitrou244651a2009-05-04 18:56:13 +00001977/* The decoder. The only state we preserve is our read position,
1978 * i.e. how many characters we have consumed. So if we end in the
1979 * middle of a shift sequence we have to back off the read position
1980 * and the output to the beginning of the sequence, otherwise we lose
1981 * all the shift state (seen bits, number of bits seen, high
1982 * surrogate). */
1983
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001984PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001985 Py_ssize_t size,
1986 const char *errors,
1987 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001988{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001990 Py_ssize_t startinpos;
1991 Py_ssize_t endinpos;
1992 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001993 const char *e;
1994 PyUnicodeObject *unicode;
1995 Py_UNICODE *p;
1996 const char *errmsg = "";
1997 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001998 Py_UNICODE *shiftOutStart;
1999 unsigned int base64bits = 0;
2000 unsigned long base64buffer = 0;
2001 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002002 PyObject *errorHandler = NULL;
2003 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002004
2005 unicode = _PyUnicode_New(size);
2006 if (!unicode)
2007 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002008 if (size == 0) {
2009 if (consumed)
2010 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002011 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002012 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002013
2014 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002015 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002016 e = s + size;
2017
2018 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002020 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002021 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002022
Antoine Pitrou244651a2009-05-04 18:56:13 +00002023 if (inShift) { /* in a base-64 section */
2024 if (IS_BASE64(ch)) { /* consume a base-64 character */
2025 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2026 base64bits += 6;
2027 s++;
2028 if (base64bits >= 16) {
2029 /* we have enough bits for a UTF-16 value */
2030 Py_UNICODE outCh = (Py_UNICODE)
2031 (base64buffer >> (base64bits-16));
2032 base64bits -= 16;
2033 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2034 if (surrogate) {
2035 /* expecting a second surrogate */
2036 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2037#ifdef Py_UNICODE_WIDE
2038 *p++ = (((surrogate & 0x3FF)<<10)
2039 | (outCh & 0x3FF)) + 0x10000;
2040#else
2041 *p++ = surrogate;
2042 *p++ = outCh;
2043#endif
2044 surrogate = 0;
2045 }
2046 else {
2047 surrogate = 0;
2048 errmsg = "second surrogate missing";
2049 goto utf7Error;
2050 }
2051 }
2052 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2053 /* first surrogate */
2054 surrogate = outCh;
2055 }
2056 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2057 errmsg = "unexpected second surrogate";
2058 goto utf7Error;
2059 }
2060 else {
2061 *p++ = outCh;
2062 }
2063 }
2064 }
2065 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002066 inShift = 0;
2067 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002068 if (surrogate) {
2069 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002070 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002071 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002072 if (base64bits > 0) { /* left-over bits */
2073 if (base64bits >= 6) {
2074 /* We've seen at least one base-64 character */
2075 errmsg = "partial character in shift sequence";
2076 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002077 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002078 else {
2079 /* Some bits remain; they should be zero */
2080 if (base64buffer != 0) {
2081 errmsg = "non-zero padding bits in shift sequence";
2082 goto utf7Error;
2083 }
2084 }
2085 }
2086 if (ch != '-') {
2087 /* '-' is absorbed; other terminating
2088 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002089 *p++ = ch;
2090 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002091 }
2092 }
2093 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002095 s++; /* consume '+' */
2096 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002097 s++;
2098 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002099 }
2100 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002101 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002102 shiftOutStart = p;
2103 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002104 }
2105 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002106 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002107 *p++ = ch;
2108 s++;
2109 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002110 else {
2111 startinpos = s-starts;
2112 s++;
2113 errmsg = "unexpected special character";
2114 goto utf7Error;
2115 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002116 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002117utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002118 outpos = p-PyUnicode_AS_UNICODE(unicode);
2119 endinpos = s-starts;
2120 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002121 errors, &errorHandler,
2122 "utf7", errmsg,
2123 &starts, &e, &startinpos, &endinpos, &exc, &s,
2124 &unicode, &outpos, &p))
2125 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002126 }
2127
Antoine Pitrou244651a2009-05-04 18:56:13 +00002128 /* end of string */
2129
2130 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2131 /* if we're in an inconsistent state, that's an error */
2132 if (surrogate ||
2133 (base64bits >= 6) ||
2134 (base64bits > 0 && base64buffer != 0)) {
2135 outpos = p-PyUnicode_AS_UNICODE(unicode);
2136 endinpos = size;
2137 if (unicode_decode_call_errorhandler(
2138 errors, &errorHandler,
2139 "utf7", "unterminated shift sequence",
2140 &starts, &e, &startinpos, &endinpos, &exc, &s,
2141 &unicode, &outpos, &p))
2142 goto onError;
2143 if (s < e)
2144 goto restart;
2145 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002146 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002147
2148 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002149 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002150 if (inShift) {
2151 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002152 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002153 }
2154 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002155 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002156 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002157 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002158
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002159 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002160 goto onError;
2161
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002162 Py_XDECREF(errorHandler);
2163 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002164 return (PyObject *)unicode;
2165
Benjamin Peterson29060642009-01-31 22:14:21 +00002166 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002167 Py_XDECREF(errorHandler);
2168 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002169 Py_DECREF(unicode);
2170 return NULL;
2171}
2172
2173
2174PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002175 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002176 int base64SetO,
2177 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002178 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002179{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002180 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002181 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002182 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002183 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002184 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002185 unsigned int base64bits = 0;
2186 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002187 char * out;
2188 char * start;
2189
2190 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002191 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002192
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002193 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002194 return PyErr_NoMemory();
2195
Antoine Pitrou244651a2009-05-04 18:56:13 +00002196 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002197 if (v == NULL)
2198 return NULL;
2199
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002200 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002201 for (;i < size; ++i) {
2202 Py_UNICODE ch = s[i];
2203
Antoine Pitrou244651a2009-05-04 18:56:13 +00002204 if (inShift) {
2205 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2206 /* shifting out */
2207 if (base64bits) { /* output remaining bits */
2208 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2209 base64buffer = 0;
2210 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002211 }
2212 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002213 /* Characters not in the BASE64 set implicitly unshift the sequence
2214 so no '-' is required, except if the character is itself a '-' */
2215 if (IS_BASE64(ch) || ch == '-') {
2216 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002217 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002218 *out++ = (char) ch;
2219 }
2220 else {
2221 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002222 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002223 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002224 else { /* not in a shift sequence */
2225 if (ch == '+') {
2226 *out++ = '+';
2227 *out++ = '-';
2228 }
2229 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2230 *out++ = (char) ch;
2231 }
2232 else {
2233 *out++ = '+';
2234 inShift = 1;
2235 goto encode_char;
2236 }
2237 }
2238 continue;
2239encode_char:
2240#ifdef Py_UNICODE_WIDE
2241 if (ch >= 0x10000) {
2242 /* code first surrogate */
2243 base64bits += 16;
2244 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2245 while (base64bits >= 6) {
2246 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2247 base64bits -= 6;
2248 }
2249 /* prepare second surrogate */
2250 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2251 }
2252#endif
2253 base64bits += 16;
2254 base64buffer = (base64buffer << 16) | ch;
2255 while (base64bits >= 6) {
2256 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2257 base64bits -= 6;
2258 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002259 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002260 if (base64bits)
2261 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2262 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002263 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002264 if (_PyBytes_Resize(&v, out - start) < 0)
2265 return NULL;
2266 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002267}
2268
Antoine Pitrou244651a2009-05-04 18:56:13 +00002269#undef IS_BASE64
2270#undef FROM_BASE64
2271#undef TO_BASE64
2272#undef DECODE_DIRECT
2273#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002274
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275/* --- UTF-8 Codec -------------------------------------------------------- */
2276
/* Lookup table indexed by the first byte of a UTF-8 sequence.  The entry is
   the total length in bytes of the sequence that byte introduces; 0 marks a
   byte that cannot start a sequence.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: continuation bytes, never a valid start */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 are rejected; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are rejected */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2298
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002300 Py_ssize_t size,
2301 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302{
Walter Dörwald69652032004-09-07 20:24:22 +00002303 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2304}
2305
Antoine Pitrouab868312009-01-10 15:40:25 +00002306/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2307#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2308
2309/* Mask to quickly check whether a C 'long' contains a
2310 non-ASCII, UTF8-encoded char. */
2311#if (SIZEOF_LONG == 8)
2312# define ASCII_CHAR_MASK 0x8080808080808080L
2313#elif (SIZEOF_LONG == 4)
2314# define ASCII_CHAR_MASK 0x80808080L
2315#else
2316# error C 'long' size should be either 4 or 8!
2317#endif
2318
Walter Dörwald69652032004-09-07 20:24:22 +00002319PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002320 Py_ssize_t size,
2321 const char *errors,
2322 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002323{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002324 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002325 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002326 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002327 Py_ssize_t startinpos;
2328 Py_ssize_t endinpos;
2329 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002330 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002331 PyUnicodeObject *unicode;
2332 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002333 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002334 PyObject *errorHandler = NULL;
2335 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336
2337 /* Note: size will always be longer than the resulting Unicode
2338 character count */
2339 unicode = _PyUnicode_New(size);
2340 if (!unicode)
2341 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002342 if (size == 0) {
2343 if (consumed)
2344 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347
2348 /* Unpack UTF-8 encoded data */
2349 p = unicode->str;
2350 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002351 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352
2353 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002354 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355
2356 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002357 /* Fast path for runs of ASCII characters. Given that common UTF-8
2358 input will consist of an overwhelming majority of ASCII
2359 characters, we try to optimize for this case by checking
2360 as many characters as a C 'long' can contain.
2361 First, check if we can do an aligned read, as most CPUs have
2362 a penalty for unaligned reads.
2363 */
2364 if (!((size_t) s & LONG_PTR_MASK)) {
2365 /* Help register allocation */
2366 register const char *_s = s;
2367 register Py_UNICODE *_p = p;
2368 while (_s < aligned_end) {
2369 /* Read a whole long at a time (either 4 or 8 bytes),
2370 and do a fast unrolled copy if it only contains ASCII
2371 characters. */
2372 unsigned long data = *(unsigned long *) _s;
2373 if (data & ASCII_CHAR_MASK)
2374 break;
2375 _p[0] = (unsigned char) _s[0];
2376 _p[1] = (unsigned char) _s[1];
2377 _p[2] = (unsigned char) _s[2];
2378 _p[3] = (unsigned char) _s[3];
2379#if (SIZEOF_LONG == 8)
2380 _p[4] = (unsigned char) _s[4];
2381 _p[5] = (unsigned char) _s[5];
2382 _p[6] = (unsigned char) _s[6];
2383 _p[7] = (unsigned char) _s[7];
2384#endif
2385 _s += SIZEOF_LONG;
2386 _p += SIZEOF_LONG;
2387 }
2388 s = _s;
2389 p = _p;
2390 if (s == e)
2391 break;
2392 ch = (unsigned char)*s;
2393 }
2394 }
2395
2396 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002397 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 s++;
2399 continue;
2400 }
2401
2402 n = utf8_code_length[ch];
2403
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002404 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002405 if (consumed)
2406 break;
2407 else {
2408 errmsg = "unexpected end of data";
2409 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002410 endinpos = startinpos+1;
2411 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2412 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002413 goto utf8Error;
2414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416
2417 switch (n) {
2418
2419 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002420 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002421 startinpos = s-starts;
2422 endinpos = startinpos+1;
2423 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424
2425 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002426 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002427 startinpos = s-starts;
2428 endinpos = startinpos+1;
2429 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430
2431 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002432 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002433 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002434 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002435 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002436 goto utf8Error;
2437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002439 assert ((ch > 0x007F) && (ch <= 0x07FF));
2440 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 break;
2442
2443 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002444 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2445 will result in surrogates in range d800-dfff. Surrogates are
2446 not valid UTF-8 so they are rejected.
2447 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2448 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002449 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002450 (s[2] & 0xc0) != 0x80 ||
2451 ((unsigned char)s[0] == 0xE0 &&
2452 (unsigned char)s[1] < 0xA0) ||
2453 ((unsigned char)s[0] == 0xED &&
2454 (unsigned char)s[1] > 0x9F)) {
2455 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002456 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002457 endinpos = startinpos + 1;
2458
2459 /* if s[1] first two bits are 1 and 0, then the invalid
2460 continuation byte is s[2], so increment endinpos by 1,
2461 if not, s[1] is invalid and endinpos doesn't need to
2462 be incremented. */
2463 if ((s[1] & 0xC0) == 0x80)
2464 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 goto utf8Error;
2466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002468 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2469 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002470 break;
2471
2472 case 4:
2473 if ((s[1] & 0xc0) != 0x80 ||
2474 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002475 (s[3] & 0xc0) != 0x80 ||
2476 ((unsigned char)s[0] == 0xF0 &&
2477 (unsigned char)s[1] < 0x90) ||
2478 ((unsigned char)s[0] == 0xF4 &&
2479 (unsigned char)s[1] > 0x8F)) {
2480 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002481 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002482 endinpos = startinpos + 1;
2483 if ((s[1] & 0xC0) == 0x80) {
2484 endinpos++;
2485 if ((s[2] & 0xC0) == 0x80)
2486 endinpos++;
2487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 goto utf8Error;
2489 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002490 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002491 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2492 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2493
Fredrik Lundh8f455852001-06-27 18:59:43 +00002494#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002495 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002496#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002497 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002498
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002499 /* translate from 10000..10FFFF to 0..FFFF */
2500 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002501
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002502 /* high surrogate = top 10 bits added to D800 */
2503 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002504
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002505 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002506 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002507#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 }
2510 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002511 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002512
Benjamin Peterson29060642009-01-31 22:14:21 +00002513 utf8Error:
2514 outpos = p-PyUnicode_AS_UNICODE(unicode);
2515 if (unicode_decode_call_errorhandler(
2516 errors, &errorHandler,
2517 "utf8", errmsg,
2518 &starts, &e, &startinpos, &endinpos, &exc, &s,
2519 &unicode, &outpos, &p))
2520 goto onError;
2521 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 }
Walter Dörwald69652032004-09-07 20:24:22 +00002523 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002524 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525
2526 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002527 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528 goto onError;
2529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002530 Py_XDECREF(errorHandler);
2531 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 return (PyObject *)unicode;
2533
Benjamin Peterson29060642009-01-31 22:14:21 +00002534 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 Py_XDECREF(errorHandler);
2536 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 Py_DECREF(unicode);
2538 return NULL;
2539}
2540
Antoine Pitrouab868312009-01-10 15:40:25 +00002541#undef ASCII_CHAR_MASK
2542
2543
Tim Peters602f7402002-04-27 18:03:26 +00002544/* Allocation strategy: if the string is short, convert into a stack buffer
2545 and allocate exactly as much space needed at the end. Else allocate the
2546 maximum possible needed (4 result bytes per Unicode character), and return
2547 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002548*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002549PyObject *
2550PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002551 Py_ssize_t size,
2552 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553{
Tim Peters602f7402002-04-27 18:03:26 +00002554#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002555
Guido van Rossum98297ee2007-11-06 21:34:58 +00002556 Py_ssize_t i; /* index into s of next input byte */
2557 PyObject *result; /* result string object */
2558 char *p; /* next free byte in output buffer */
2559 Py_ssize_t nallocated; /* number of result bytes allocated */
2560 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002561 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002562 PyObject *errorHandler = NULL;
2563 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002564
Tim Peters602f7402002-04-27 18:03:26 +00002565 assert(s != NULL);
2566 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567
Tim Peters602f7402002-04-27 18:03:26 +00002568 if (size <= MAX_SHORT_UNICHARS) {
2569 /* Write into the stack buffer; nallocated can't overflow.
2570 * At the end, we'll allocate exactly as much heap space as it
2571 * turns out we need.
2572 */
2573 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002574 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002575 p = stackbuf;
2576 }
2577 else {
2578 /* Overallocate on the heap, and give the excess back at the end. */
2579 nallocated = size * 4;
2580 if (nallocated / 4 != size) /* overflow! */
2581 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002582 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002583 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002584 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002585 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002586 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002587
Tim Peters602f7402002-04-27 18:03:26 +00002588 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002589 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002590
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002591 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002592 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002594
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002596 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002597 *p++ = (char)(0xc0 | (ch >> 6));
2598 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002599 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002600#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002601 /* Special case: check for high and low surrogate */
2602 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2603 Py_UCS4 ch2 = s[i];
2604 /* Combine the two surrogates to form a UCS4 value */
2605 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2606 i++;
2607
2608 /* Encode UCS4 Unicode ordinals */
2609 *p++ = (char)(0xf0 | (ch >> 18));
2610 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002611 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2612 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002613 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002614#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002615 Py_ssize_t newpos;
2616 PyObject *rep;
2617 Py_ssize_t repsize, k;
2618 rep = unicode_encode_call_errorhandler
2619 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2620 s, size, &exc, i-1, i, &newpos);
2621 if (!rep)
2622 goto error;
2623
2624 if (PyBytes_Check(rep))
2625 repsize = PyBytes_GET_SIZE(rep);
2626 else
2627 repsize = PyUnicode_GET_SIZE(rep);
2628
2629 if (repsize > 4) {
2630 Py_ssize_t offset;
2631
2632 if (result == NULL)
2633 offset = p - stackbuf;
2634 else
2635 offset = p - PyBytes_AS_STRING(result);
2636
2637 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2638 /* integer overflow */
2639 PyErr_NoMemory();
2640 goto error;
2641 }
2642 nallocated += repsize - 4;
2643 if (result != NULL) {
2644 if (_PyBytes_Resize(&result, nallocated) < 0)
2645 goto error;
2646 } else {
2647 result = PyBytes_FromStringAndSize(NULL, nallocated);
2648 if (result == NULL)
2649 goto error;
2650 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2651 }
2652 p = PyBytes_AS_STRING(result) + offset;
2653 }
2654
2655 if (PyBytes_Check(rep)) {
2656 char *prep = PyBytes_AS_STRING(rep);
2657 for(k = repsize; k > 0; k--)
2658 *p++ = *prep++;
2659 } else /* rep is unicode */ {
2660 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2661 Py_UNICODE c;
2662
2663 for(k=0; k<repsize; k++) {
2664 c = prep[k];
2665 if (0x80 <= c) {
2666 raise_encode_exception(&exc, "utf-8", s, size,
2667 i-1, i, "surrogates not allowed");
2668 goto error;
2669 }
2670 *p++ = (char)prep[k];
2671 }
2672 }
2673 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002674#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002675 }
Victor Stinner445a6232010-04-22 20:01:57 +00002676#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002677 } else if (ch < 0x10000) {
2678 *p++ = (char)(0xe0 | (ch >> 12));
2679 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2680 *p++ = (char)(0x80 | (ch & 0x3f));
2681 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002682 /* Encode UCS4 Unicode ordinals */
2683 *p++ = (char)(0xf0 | (ch >> 18));
2684 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2685 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2686 *p++ = (char)(0x80 | (ch & 0x3f));
2687 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002689
Guido van Rossum98297ee2007-11-06 21:34:58 +00002690 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002691 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002692 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002693 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002694 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002695 }
2696 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002697 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002698 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002699 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002700 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002701 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002702 Py_XDECREF(errorHandler);
2703 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002704 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002705 error:
2706 Py_XDECREF(errorHandler);
2707 Py_XDECREF(exc);
2708 Py_XDECREF(result);
2709 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002710
Tim Peters602f7402002-04-27 18:03:26 +00002711#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712}
2713
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2715{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 if (!PyUnicode_Check(unicode)) {
2717 PyErr_BadArgument();
2718 return NULL;
2719 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002720 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002721 PyUnicode_GET_SIZE(unicode),
2722 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723}
2724
Walter Dörwald41980ca2007-08-16 21:55:45 +00002725/* --- UTF-32 Codec ------------------------------------------------------- */
2726
2727PyObject *
2728PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002729 Py_ssize_t size,
2730 const char *errors,
2731 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002732{
2733 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2734}
2735
2736PyObject *
2737PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 Py_ssize_t size,
2739 const char *errors,
2740 int *byteorder,
2741 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002742{
2743 const char *starts = s;
2744 Py_ssize_t startinpos;
2745 Py_ssize_t endinpos;
2746 Py_ssize_t outpos;
2747 PyUnicodeObject *unicode;
2748 Py_UNICODE *p;
2749#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002750 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002751 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002752#else
2753 const int pairs = 0;
2754#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002755 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002756 int bo = 0; /* assume native ordering by default */
2757 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002758 /* Offsets from q for retrieving bytes in the right order. */
2759#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2760 int iorder[] = {0, 1, 2, 3};
2761#else
2762 int iorder[] = {3, 2, 1, 0};
2763#endif
2764 PyObject *errorHandler = NULL;
2765 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002766
Walter Dörwald41980ca2007-08-16 21:55:45 +00002767 q = (unsigned char *)s;
2768 e = q + size;
2769
2770 if (byteorder)
2771 bo = *byteorder;
2772
2773 /* Check for BOM marks (U+FEFF) in the input and adjust current
2774 byte order setting accordingly. In native mode, the leading BOM
2775 mark is skipped, in all other modes, it is copied to the output
2776 stream as-is (giving a ZWNBSP character). */
2777 if (bo == 0) {
2778 if (size >= 4) {
2779 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002780 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002781#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002782 if (bom == 0x0000FEFF) {
2783 q += 4;
2784 bo = -1;
2785 }
2786 else if (bom == 0xFFFE0000) {
2787 q += 4;
2788 bo = 1;
2789 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002790#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002791 if (bom == 0x0000FEFF) {
2792 q += 4;
2793 bo = 1;
2794 }
2795 else if (bom == 0xFFFE0000) {
2796 q += 4;
2797 bo = -1;
2798 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002799#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002800 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002801 }
2802
2803 if (bo == -1) {
2804 /* force LE */
2805 iorder[0] = 0;
2806 iorder[1] = 1;
2807 iorder[2] = 2;
2808 iorder[3] = 3;
2809 }
2810 else if (bo == 1) {
2811 /* force BE */
2812 iorder[0] = 3;
2813 iorder[1] = 2;
2814 iorder[2] = 1;
2815 iorder[3] = 0;
2816 }
2817
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002818 /* On narrow builds we split characters outside the BMP into two
2819 codepoints => count how much extra space we need. */
2820#ifndef Py_UNICODE_WIDE
2821 for (qq = q; qq < e; qq += 4)
2822 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2823 pairs++;
2824#endif
2825
2826 /* This might be one to much, because of a BOM */
2827 unicode = _PyUnicode_New((size+3)/4+pairs);
2828 if (!unicode)
2829 return NULL;
2830 if (size == 0)
2831 return (PyObject *)unicode;
2832
2833 /* Unpack UTF-32 encoded data */
2834 p = unicode->str;
2835
Walter Dörwald41980ca2007-08-16 21:55:45 +00002836 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002837 Py_UCS4 ch;
2838 /* remaining bytes at the end? (size should be divisible by 4) */
2839 if (e-q<4) {
2840 if (consumed)
2841 break;
2842 errmsg = "truncated data";
2843 startinpos = ((const char *)q)-starts;
2844 endinpos = ((const char *)e)-starts;
2845 goto utf32Error;
2846 /* The remaining input chars are ignored if the callback
2847 chooses to skip the input */
2848 }
2849 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2850 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002851
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 if (ch >= 0x110000)
2853 {
2854 errmsg = "codepoint not in range(0x110000)";
2855 startinpos = ((const char *)q)-starts;
2856 endinpos = startinpos+4;
2857 goto utf32Error;
2858 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002859#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 if (ch >= 0x10000)
2861 {
2862 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2863 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2864 }
2865 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002866#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002867 *p++ = ch;
2868 q += 4;
2869 continue;
2870 utf32Error:
2871 outpos = p-PyUnicode_AS_UNICODE(unicode);
2872 if (unicode_decode_call_errorhandler(
2873 errors, &errorHandler,
2874 "utf32", errmsg,
2875 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2876 &unicode, &outpos, &p))
2877 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002878 }
2879
2880 if (byteorder)
2881 *byteorder = bo;
2882
2883 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002885
2886 /* Adjust length */
2887 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2888 goto onError;
2889
2890 Py_XDECREF(errorHandler);
2891 Py_XDECREF(exc);
2892 return (PyObject *)unicode;
2893
Benjamin Peterson29060642009-01-31 22:14:21 +00002894 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002895 Py_DECREF(unicode);
2896 Py_XDECREF(errorHandler);
2897 Py_XDECREF(exc);
2898 return NULL;
2899}
2900
2901PyObject *
2902PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002903 Py_ssize_t size,
2904 const char *errors,
2905 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002906{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002907 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002908 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002909 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002910#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002911 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002912#else
2913 const int pairs = 0;
2914#endif
2915 /* Offsets from p for storing byte pairs in the right order. */
2916#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2917 int iorder[] = {0, 1, 2, 3};
2918#else
2919 int iorder[] = {3, 2, 1, 0};
2920#endif
2921
Benjamin Peterson29060642009-01-31 22:14:21 +00002922#define STORECHAR(CH) \
2923 do { \
2924 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2925 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2926 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2927 p[iorder[0]] = (CH) & 0xff; \
2928 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002929 } while(0)
2930
2931 /* In narrow builds we can output surrogate pairs as one codepoint,
2932 so we need less space. */
2933#ifndef Py_UNICODE_WIDE
2934 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002935 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2936 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2937 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002938#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002939 nsize = (size - pairs + (byteorder == 0));
2940 bytesize = nsize * 4;
2941 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002942 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002943 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002944 if (v == NULL)
2945 return NULL;
2946
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002947 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002948 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002950 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002951 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002952
2953 if (byteorder == -1) {
2954 /* force LE */
2955 iorder[0] = 0;
2956 iorder[1] = 1;
2957 iorder[2] = 2;
2958 iorder[3] = 3;
2959 }
2960 else if (byteorder == 1) {
2961 /* force BE */
2962 iorder[0] = 3;
2963 iorder[1] = 2;
2964 iorder[2] = 1;
2965 iorder[3] = 0;
2966 }
2967
2968 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002970#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002971 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2972 Py_UCS4 ch2 = *s;
2973 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2974 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2975 s++;
2976 size--;
2977 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002978 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002979#endif
2980 STORECHAR(ch);
2981 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002982
2983 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002984 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002985#undef STORECHAR
2986}
2987
2988PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2989{
2990 if (!PyUnicode_Check(unicode)) {
2991 PyErr_BadArgument();
2992 return NULL;
2993 }
2994 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 PyUnicode_GET_SIZE(unicode),
2996 NULL,
2997 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002998}
2999
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000/* --- UTF-16 Codec ------------------------------------------------------- */
3001
Tim Peters772747b2001-08-09 22:21:55 +00003002PyObject *
3003PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003004 Py_ssize_t size,
3005 const char *errors,
3006 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007{
Walter Dörwald69652032004-09-07 20:24:22 +00003008 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3009}
3010
Antoine Pitrouab868312009-01-10 15:40:25 +00003011/* Two masks for fast checking of whether a C 'long' may contain
3012 UTF16-encoded surrogate characters. This is an efficient heuristic,
3013 assuming that non-surrogate characters with a code point >= 0x8000 are
3014 rare in most input.
3015 FAST_CHAR_MASK is used when the input is in native byte ordering,
3016 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003017*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003018#if (SIZEOF_LONG == 8)
3019# define FAST_CHAR_MASK 0x8000800080008000L
3020# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3021#elif (SIZEOF_LONG == 4)
3022# define FAST_CHAR_MASK 0x80008000L
3023# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3024#else
3025# error C 'long' size should be either 4 or 8!
3026#endif
3027
Walter Dörwald69652032004-09-07 20:24:22 +00003028PyObject *
3029PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 Py_ssize_t size,
3031 const char *errors,
3032 int *byteorder,
3033 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003034{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003035 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003036 Py_ssize_t startinpos;
3037 Py_ssize_t endinpos;
3038 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 PyUnicodeObject *unicode;
3040 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003041 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003042 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003043 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003044 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003045 /* Offsets from q for retrieving byte pairs in the right order. */
3046#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3047 int ihi = 1, ilo = 0;
3048#else
3049 int ihi = 0, ilo = 1;
3050#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 PyObject *errorHandler = NULL;
3052 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053
3054 /* Note: size will always be longer than the resulting Unicode
3055 character count */
3056 unicode = _PyUnicode_New(size);
3057 if (!unicode)
3058 return NULL;
3059 if (size == 0)
3060 return (PyObject *)unicode;
3061
3062 /* Unpack UTF-16 encoded data */
3063 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003064 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003065 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066
3067 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003068 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003070 /* Check for BOM marks (U+FEFF) in the input and adjust current
3071 byte order setting accordingly. In native mode, the leading BOM
3072 mark is skipped, in all other modes, it is copied to the output
3073 stream as-is (giving a ZWNBSP character). */
3074 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003075 if (size >= 2) {
3076 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003077#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 if (bom == 0xFEFF) {
3079 q += 2;
3080 bo = -1;
3081 }
3082 else if (bom == 0xFFFE) {
3083 q += 2;
3084 bo = 1;
3085 }
Tim Petersced69f82003-09-16 20:30:58 +00003086#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003087 if (bom == 0xFEFF) {
3088 q += 2;
3089 bo = 1;
3090 }
3091 else if (bom == 0xFFFE) {
3092 q += 2;
3093 bo = -1;
3094 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003095#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003096 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098
Tim Peters772747b2001-08-09 22:21:55 +00003099 if (bo == -1) {
3100 /* force LE */
3101 ihi = 1;
3102 ilo = 0;
3103 }
3104 else if (bo == 1) {
3105 /* force BE */
3106 ihi = 0;
3107 ilo = 1;
3108 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003109#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3110 native_ordering = ilo < ihi;
3111#else
3112 native_ordering = ilo > ihi;
3113#endif
Tim Peters772747b2001-08-09 22:21:55 +00003114
Antoine Pitrouab868312009-01-10 15:40:25 +00003115 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003116 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003117 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003118 /* First check for possible aligned read of a C 'long'. Unaligned
3119 reads are more expensive, better to defer to another iteration. */
3120 if (!((size_t) q & LONG_PTR_MASK)) {
3121 /* Fast path for runs of non-surrogate chars. */
3122 register const unsigned char *_q = q;
3123 Py_UNICODE *_p = p;
3124 if (native_ordering) {
3125 /* Native ordering is simple: as long as the input cannot
3126 possibly contain a surrogate char, do an unrolled copy
3127 of several 16-bit code points to the target object.
3128 The non-surrogate check is done on several input bytes
3129 at a time (as many as a C 'long' can contain). */
3130 while (_q < aligned_end) {
3131 unsigned long data = * (unsigned long *) _q;
3132 if (data & FAST_CHAR_MASK)
3133 break;
3134 _p[0] = ((unsigned short *) _q)[0];
3135 _p[1] = ((unsigned short *) _q)[1];
3136#if (SIZEOF_LONG == 8)
3137 _p[2] = ((unsigned short *) _q)[2];
3138 _p[3] = ((unsigned short *) _q)[3];
3139#endif
3140 _q += SIZEOF_LONG;
3141 _p += SIZEOF_LONG / 2;
3142 }
3143 }
3144 else {
3145 /* Byteswapped ordering is similar, but we must decompose
3146 the copy bytewise, and take care of zero'ing out the
3147 upper bytes if the target object is in 32-bit units
3148 (that is, in UCS-4 builds). */
3149 while (_q < aligned_end) {
3150 unsigned long data = * (unsigned long *) _q;
3151 if (data & SWAPPED_FAST_CHAR_MASK)
3152 break;
3153 /* Zero upper bytes in UCS-4 builds */
3154#if (Py_UNICODE_SIZE > 2)
3155 _p[0] = 0;
3156 _p[1] = 0;
3157#if (SIZEOF_LONG == 8)
3158 _p[2] = 0;
3159 _p[3] = 0;
3160#endif
3161#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003162 /* Issue #4916; UCS-4 builds on big endian machines must
3163 fill the two last bytes of each 4-byte unit. */
3164#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3165# define OFF 2
3166#else
3167# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003168#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003169 ((unsigned char *) _p)[OFF + 1] = _q[0];
3170 ((unsigned char *) _p)[OFF + 0] = _q[1];
3171 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3172 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3173#if (SIZEOF_LONG == 8)
3174 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3175 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3176 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3177 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3178#endif
3179#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003180 _q += SIZEOF_LONG;
3181 _p += SIZEOF_LONG / 2;
3182 }
3183 }
3184 p = _p;
3185 q = _q;
3186 if (q >= e)
3187 break;
3188 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003189 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190
Benjamin Peterson14339b62009-01-31 16:36:08 +00003191 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003192
3193 if (ch < 0xD800 || ch > 0xDFFF) {
3194 *p++ = ch;
3195 continue;
3196 }
3197
3198 /* UTF-16 code pair: */
3199 if (q > e) {
3200 errmsg = "unexpected end of data";
3201 startinpos = (((const char *)q) - 2) - starts;
3202 endinpos = ((const char *)e) + 1 - starts;
3203 goto utf16Error;
3204 }
3205 if (0xD800 <= ch && ch <= 0xDBFF) {
3206 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3207 q += 2;
3208 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003209#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 *p++ = ch;
3211 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003212#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003214#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003215 continue;
3216 }
3217 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003218 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 startinpos = (((const char *)q)-4)-starts;
3220 endinpos = startinpos+2;
3221 goto utf16Error;
3222 }
3223
Benjamin Peterson14339b62009-01-31 16:36:08 +00003224 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 errmsg = "illegal encoding";
3226 startinpos = (((const char *)q)-2)-starts;
3227 endinpos = startinpos+2;
3228 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003229
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 utf16Error:
3231 outpos = p - PyUnicode_AS_UNICODE(unicode);
3232 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003233 errors,
3234 &errorHandler,
3235 "utf16", errmsg,
3236 &starts,
3237 (const char **)&e,
3238 &startinpos,
3239 &endinpos,
3240 &exc,
3241 (const char **)&q,
3242 &unicode,
3243 &outpos,
3244 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003245 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003247 /* remaining byte at the end? (size should be even) */
3248 if (e == q) {
3249 if (!consumed) {
3250 errmsg = "truncated data";
3251 startinpos = ((const char *)q) - starts;
3252 endinpos = ((const char *)e) + 1 - starts;
3253 outpos = p - PyUnicode_AS_UNICODE(unicode);
3254 if (unicode_decode_call_errorhandler(
3255 errors,
3256 &errorHandler,
3257 "utf16", errmsg,
3258 &starts,
3259 (const char **)&e,
3260 &startinpos,
3261 &endinpos,
3262 &exc,
3263 (const char **)&q,
3264 &unicode,
3265 &outpos,
3266 &p))
3267 goto onError;
3268 /* The remaining input chars are ignored if the callback
3269 chooses to skip the input */
3270 }
3271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272
3273 if (byteorder)
3274 *byteorder = bo;
3275
Walter Dörwald69652032004-09-07 20:24:22 +00003276 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003277 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003278
Guido van Rossumd57fd912000-03-10 22:53:23 +00003279 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003280 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281 goto onError;
3282
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 Py_XDECREF(errorHandler);
3284 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 return (PyObject *)unicode;
3286
Benjamin Peterson29060642009-01-31 22:14:21 +00003287 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 Py_XDECREF(errorHandler);
3290 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 return NULL;
3292}
3293
Antoine Pitrouab868312009-01-10 15:40:25 +00003294#undef FAST_CHAR_MASK
3295#undef SWAPPED_FAST_CHAR_MASK
3296
Tim Peters772747b2001-08-09 22:21:55 +00003297PyObject *
3298PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003299 Py_ssize_t size,
3300 const char *errors,
3301 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003303 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003304 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003305 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003306#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003307 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003308#else
3309 const int pairs = 0;
3310#endif
Tim Peters772747b2001-08-09 22:21:55 +00003311 /* Offsets from p for storing byte pairs in the right order. */
3312#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3313 int ihi = 1, ilo = 0;
3314#else
3315 int ihi = 0, ilo = 1;
3316#endif
3317
Benjamin Peterson29060642009-01-31 22:14:21 +00003318#define STORECHAR(CH) \
3319 do { \
3320 p[ihi] = ((CH) >> 8) & 0xff; \
3321 p[ilo] = (CH) & 0xff; \
3322 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003323 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003325#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003326 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003327 if (s[i] >= 0x10000)
3328 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003329#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003330 /* 2 * (size + pairs + (byteorder == 0)) */
3331 if (size > PY_SSIZE_T_MAX ||
3332 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003334 nsize = size + pairs + (byteorder == 0);
3335 bytesize = nsize * 2;
3336 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003338 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 if (v == NULL)
3340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003342 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003345 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003346 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003347
3348 if (byteorder == -1) {
3349 /* force LE */
3350 ihi = 1;
3351 ilo = 0;
3352 }
3353 else if (byteorder == 1) {
3354 /* force BE */
3355 ihi = 0;
3356 ilo = 1;
3357 }
3358
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003359 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003360 Py_UNICODE ch = *s++;
3361 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003362#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003363 if (ch >= 0x10000) {
3364 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3365 ch = 0xD800 | ((ch-0x10000) >> 10);
3366 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003367#endif
Tim Peters772747b2001-08-09 22:21:55 +00003368 STORECHAR(ch);
3369 if (ch2)
3370 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003371 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003372
3373 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003374 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003375#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376}
3377
3378PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3379{
3380 if (!PyUnicode_Check(unicode)) {
3381 PyErr_BadArgument();
3382 return NULL;
3383 }
3384 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003385 PyUnicode_GET_SIZE(unicode),
3386 NULL,
3387 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388}
3389
3390/* --- Unicode Escape Codec ----------------------------------------------- */
3391
Fredrik Lundh06d12682001-01-24 07:59:11 +00003392static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003393
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 Py_ssize_t size,
3396 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003399 Py_ssize_t startinpos;
3400 Py_ssize_t endinpos;
3401 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003404 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003406 char* message;
3407 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 PyObject *errorHandler = NULL;
3409 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003410
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411 /* Escaped strings will always be longer than the resulting
3412 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 length after conversion to the true value.
3414 (but if the error callback returns a long replacement string
3415 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 v = _PyUnicode_New(size);
3417 if (v == NULL)
3418 goto onError;
3419 if (size == 0)
3420 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003424
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 while (s < end) {
3426 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003427 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429
3430 /* Non-escape characters are interpreted as Unicode ordinals */
3431 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003432 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 continue;
3434 }
3435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 /* \ - Escapes */
3438 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003439 c = *s++;
3440 if (s > end)
3441 c = '\0'; /* Invalid after \ */
3442 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443
Benjamin Peterson29060642009-01-31 22:14:21 +00003444 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 case '\n': break;
3446 case '\\': *p++ = '\\'; break;
3447 case '\'': *p++ = '\''; break;
3448 case '\"': *p++ = '\"'; break;
3449 case 'b': *p++ = '\b'; break;
3450 case 'f': *p++ = '\014'; break; /* FF */
3451 case 't': *p++ = '\t'; break;
3452 case 'n': *p++ = '\n'; break;
3453 case 'r': *p++ = '\r'; break;
3454 case 'v': *p++ = '\013'; break; /* VT */
3455 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3456
Benjamin Peterson29060642009-01-31 22:14:21 +00003457 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 case '0': case '1': case '2': case '3':
3459 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003460 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003461 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003462 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003463 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003464 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003466 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 break;
3468
Benjamin Peterson29060642009-01-31 22:14:21 +00003469 /* hex escapes */
3470 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003472 digits = 2;
3473 message = "truncated \\xXX escape";
3474 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475
Benjamin Peterson29060642009-01-31 22:14:21 +00003476 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003477 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003478 digits = 4;
3479 message = "truncated \\uXXXX escape";
3480 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003483 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003484 digits = 8;
3485 message = "truncated \\UXXXXXXXX escape";
3486 hexescape:
3487 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 outpos = p-PyUnicode_AS_UNICODE(v);
3489 if (s+digits>end) {
3490 endinpos = size;
3491 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003492 errors, &errorHandler,
3493 "unicodeescape", "end of string in escape sequence",
3494 &starts, &end, &startinpos, &endinpos, &exc, &s,
3495 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496 goto onError;
3497 goto nextByte;
3498 }
3499 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003500 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003501 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 endinpos = (s+i+1)-starts;
3503 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 errors, &errorHandler,
3505 "unicodeescape", message,
3506 &starts, &end, &startinpos, &endinpos, &exc, &s,
3507 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003508 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003510 }
3511 chr = (chr<<4) & ~0xF;
3512 if (c >= '0' && c <= '9')
3513 chr += c - '0';
3514 else if (c >= 'a' && c <= 'f')
3515 chr += 10 + c - 'a';
3516 else
3517 chr += 10 + c - 'A';
3518 }
3519 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003520 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 /* _decoding_error will have already written into the
3522 target buffer. */
3523 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003524 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003525 /* when we get here, chr is a 32-bit unicode character */
3526 if (chr <= 0xffff)
3527 /* UCS-2 character */
3528 *p++ = (Py_UNICODE) chr;
3529 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003530 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003531 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003532#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003533 *p++ = chr;
3534#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003535 chr -= 0x10000L;
3536 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003537 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003538#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003539 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 endinpos = s-starts;
3541 outpos = p-PyUnicode_AS_UNICODE(v);
3542 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003543 errors, &errorHandler,
3544 "unicodeescape", "illegal Unicode character",
3545 &starts, &end, &startinpos, &endinpos, &exc, &s,
3546 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003547 goto onError;
3548 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003549 break;
3550
Benjamin Peterson29060642009-01-31 22:14:21 +00003551 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003552 case 'N':
3553 message = "malformed \\N character escape";
3554 if (ucnhash_CAPI == NULL) {
3555 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003556 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003557 if (ucnhash_CAPI == NULL)
3558 goto ucnhashError;
3559 }
3560 if (*s == '{') {
3561 const char *start = s+1;
3562 /* look for the closing brace */
3563 while (*s != '}' && s < end)
3564 s++;
3565 if (s > start && s < end && *s == '}') {
3566 /* found a name. look it up in the unicode database */
3567 message = "unknown Unicode character name";
3568 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003569 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003570 goto store;
3571 }
3572 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 endinpos = s-starts;
3574 outpos = p-PyUnicode_AS_UNICODE(v);
3575 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003576 errors, &errorHandler,
3577 "unicodeescape", message,
3578 &starts, &end, &startinpos, &endinpos, &exc, &s,
3579 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003580 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003581 break;
3582
3583 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003584 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 message = "\\ at end of string";
3586 s--;
3587 endinpos = s-starts;
3588 outpos = p-PyUnicode_AS_UNICODE(v);
3589 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 errors, &errorHandler,
3591 "unicodeescape", message,
3592 &starts, &end, &startinpos, &endinpos, &exc, &s,
3593 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003594 goto onError;
3595 }
3596 else {
3597 *p++ = '\\';
3598 *p++ = (unsigned char)s[-1];
3599 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003600 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003605 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003607 Py_XDECREF(errorHandler);
3608 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003610
Benjamin Peterson29060642009-01-31 22:14:21 +00003611 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003612 PyErr_SetString(
3613 PyExc_UnicodeError,
3614 "\\N escapes not supported (can't load unicodedata module)"
3615 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003616 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 Py_XDECREF(errorHandler);
3618 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003619 return NULL;
3620
Benjamin Peterson29060642009-01-31 22:14:21 +00003621 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 Py_XDECREF(errorHandler);
3624 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625 return NULL;
3626}
3627
3628/* Return a Unicode-Escape string version of the Unicode object.
3629
3630 If quotes is true, the string is enclosed in u"" or u'' quotes as
3631 appropriate.
3632
3633*/
3634
Thomas Wouters477c8d52006-05-27 19:21:47 +00003635Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 Py_ssize_t size,
3637 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003638{
3639 /* like wcschr, but doesn't stop at NULL characters */
3640
3641 while (size-- > 0) {
3642 if (*s == ch)
3643 return s;
3644 s++;
3645 }
3646
3647 return NULL;
3648}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003649
Walter Dörwald79e913e2007-05-12 11:08:06 +00003650static const char *hexdigits = "0123456789abcdef";
3651
3652PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003653 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003655 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003658#ifdef Py_UNICODE_WIDE
3659 const Py_ssize_t expandsize = 10;
3660#else
3661 const Py_ssize_t expandsize = 6;
3662#endif
3663
Thomas Wouters89f507f2006-12-13 04:49:30 +00003664 /* XXX(nnorwitz): rather than over-allocating, it would be
3665 better to choose a different scheme. Perhaps scan the
3666 first N-chars of the string and allocate based on that size.
3667 */
3668 /* Initial allocation is based on the longest-possible unichr
3669 escape.
3670
3671 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3672 unichr, so in this case it's the longest unichr escape. In
3673 narrow (UTF-16) builds this is five chars per source unichr
3674 since there are two unichrs in the surrogate pair, so in narrow
3675 (UTF-16) builds it's not the longest unichr escape.
3676
3677 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3678 so in the narrow (UTF-16) build case it's the longest unichr
3679 escape.
3680 */
3681
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003682 if (size == 0)
3683 return PyBytes_FromStringAndSize(NULL, 0);
3684
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003685 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003687
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003688 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003689 2
3690 + expandsize*size
3691 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 if (repr == NULL)
3693 return NULL;
3694
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003695 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 while (size-- > 0) {
3698 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003699
Walter Dörwald79e913e2007-05-12 11:08:06 +00003700 /* Escape backslashes */
3701 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 *p++ = '\\';
3703 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003704 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003705 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003706
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003707#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003708 /* Map 21-bit characters to '\U00xxxxxx' */
3709 else if (ch >= 0x10000) {
3710 *p++ = '\\';
3711 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003712 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3713 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3714 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3715 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3716 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3717 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3718 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3719 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003720 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003721 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003722#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3724 else if (ch >= 0xD800 && ch < 0xDC00) {
3725 Py_UNICODE ch2;
3726 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003727
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 ch2 = *s++;
3729 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003730 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003731 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3732 *p++ = '\\';
3733 *p++ = 'U';
3734 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3735 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3736 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3737 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3738 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3739 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3740 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3741 *p++ = hexdigits[ucs & 0x0000000F];
3742 continue;
3743 }
3744 /* Fall through: isolated surrogates are copied as-is */
3745 s--;
3746 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003747 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003748#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003749
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003751 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 *p++ = '\\';
3753 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003754 *p++ = hexdigits[(ch >> 12) & 0x000F];
3755 *p++ = hexdigits[(ch >> 8) & 0x000F];
3756 *p++ = hexdigits[(ch >> 4) & 0x000F];
3757 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003759
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003760 /* Map special whitespace to '\t', \n', '\r' */
3761 else if (ch == '\t') {
3762 *p++ = '\\';
3763 *p++ = 't';
3764 }
3765 else if (ch == '\n') {
3766 *p++ = '\\';
3767 *p++ = 'n';
3768 }
3769 else if (ch == '\r') {
3770 *p++ = '\\';
3771 *p++ = 'r';
3772 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003773
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003774 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003775 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003777 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003778 *p++ = hexdigits[(ch >> 4) & 0x000F];
3779 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003780 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003781
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 /* Copy everything else as-is */
3783 else
3784 *p++ = (char) ch;
3785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003787 assert(p - PyBytes_AS_STRING(repr) > 0);
3788 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3789 return NULL;
3790 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791}
3792
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003793PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003795 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 if (!PyUnicode_Check(unicode)) {
3797 PyErr_BadArgument();
3798 return NULL;
3799 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003800 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3801 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003802 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803}
3804
3805/* --- Raw Unicode Escape Codec ------------------------------------------- */
3806
3807PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003808 Py_ssize_t size,
3809 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003812 Py_ssize_t startinpos;
3813 Py_ssize_t endinpos;
3814 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 const char *end;
3818 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 PyObject *errorHandler = NULL;
3820 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003821
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 /* Escaped strings will always be longer than the resulting
3823 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 length after conversion to the true value. (But decoding error
3825 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 v = _PyUnicode_New(size);
3827 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003830 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 end = s + size;
3833 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003834 unsigned char c;
3835 Py_UCS4 x;
3836 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003837 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838
Benjamin Peterson29060642009-01-31 22:14:21 +00003839 /* Non-escape characters are interpreted as Unicode ordinals */
3840 if (*s != '\\') {
3841 *p++ = (unsigned char)*s++;
3842 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003843 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003844 startinpos = s-starts;
3845
3846 /* \u-escapes are only interpreted iff the number of leading
3847 backslashes if odd */
3848 bs = s;
3849 for (;s < end;) {
3850 if (*s != '\\')
3851 break;
3852 *p++ = (unsigned char)*s++;
3853 }
3854 if (((s - bs) & 1) == 0 ||
3855 s >= end ||
3856 (*s != 'u' && *s != 'U')) {
3857 continue;
3858 }
3859 p--;
3860 count = *s=='u' ? 4 : 8;
3861 s++;
3862
3863 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3864 outpos = p-PyUnicode_AS_UNICODE(v);
3865 for (x = 0, i = 0; i < count; ++i, ++s) {
3866 c = (unsigned char)*s;
3867 if (!ISXDIGIT(c)) {
3868 endinpos = s-starts;
3869 if (unicode_decode_call_errorhandler(
3870 errors, &errorHandler,
3871 "rawunicodeescape", "truncated \\uXXXX",
3872 &starts, &end, &startinpos, &endinpos, &exc, &s,
3873 &v, &outpos, &p))
3874 goto onError;
3875 goto nextByte;
3876 }
3877 x = (x<<4) & ~0xF;
3878 if (c >= '0' && c <= '9')
3879 x += c - '0';
3880 else if (c >= 'a' && c <= 'f')
3881 x += 10 + c - 'a';
3882 else
3883 x += 10 + c - 'A';
3884 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003885 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 /* UCS-2 character */
3887 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003888 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003889 /* UCS-4 character. Either store directly, or as
3890 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003891#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003892 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003893#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 x -= 0x10000L;
3895 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3896 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003897#endif
3898 } else {
3899 endinpos = s-starts;
3900 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003901 if (unicode_decode_call_errorhandler(
3902 errors, &errorHandler,
3903 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003904 &starts, &end, &startinpos, &endinpos, &exc, &s,
3905 &v, &outpos, &p))
3906 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003907 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003908 nextByte:
3909 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003911 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913 Py_XDECREF(errorHandler);
3914 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003916
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919 Py_XDECREF(errorHandler);
3920 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 return NULL;
3922}
3923
3924PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003925 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003927 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 char *p;
3929 char *q;
3930
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003931#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003932 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003933#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003934 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003935#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003936
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003937 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003938 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003939
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003940 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 if (repr == NULL)
3942 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003943 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003944 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003946 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 while (size-- > 0) {
3948 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003949#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 /* Map 32-bit characters to '\Uxxxxxxxx' */
3951 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003952 *p++ = '\\';
3953 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003954 *p++ = hexdigits[(ch >> 28) & 0xf];
3955 *p++ = hexdigits[(ch >> 24) & 0xf];
3956 *p++ = hexdigits[(ch >> 20) & 0xf];
3957 *p++ = hexdigits[(ch >> 16) & 0xf];
3958 *p++ = hexdigits[(ch >> 12) & 0xf];
3959 *p++ = hexdigits[(ch >> 8) & 0xf];
3960 *p++ = hexdigits[(ch >> 4) & 0xf];
3961 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003962 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003963 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003964#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3966 if (ch >= 0xD800 && ch < 0xDC00) {
3967 Py_UNICODE ch2;
3968 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003969
Benjamin Peterson29060642009-01-31 22:14:21 +00003970 ch2 = *s++;
3971 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003972 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003973 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3974 *p++ = '\\';
3975 *p++ = 'U';
3976 *p++ = hexdigits[(ucs >> 28) & 0xf];
3977 *p++ = hexdigits[(ucs >> 24) & 0xf];
3978 *p++ = hexdigits[(ucs >> 20) & 0xf];
3979 *p++ = hexdigits[(ucs >> 16) & 0xf];
3980 *p++ = hexdigits[(ucs >> 12) & 0xf];
3981 *p++ = hexdigits[(ucs >> 8) & 0xf];
3982 *p++ = hexdigits[(ucs >> 4) & 0xf];
3983 *p++ = hexdigits[ucs & 0xf];
3984 continue;
3985 }
3986 /* Fall through: isolated surrogates are copied as-is */
3987 s--;
3988 size++;
3989 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003990#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 /* Map 16-bit characters to '\uxxxx' */
3992 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 *p++ = '\\';
3994 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003995 *p++ = hexdigits[(ch >> 12) & 0xf];
3996 *p++ = hexdigits[(ch >> 8) & 0xf];
3997 *p++ = hexdigits[(ch >> 4) & 0xf];
3998 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 /* Copy everything else as-is */
4001 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 *p++ = (char) ch;
4003 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004004 size = p - q;
4005
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004006 assert(size > 0);
4007 if (_PyBytes_Resize(&repr, size) < 0)
4008 return NULL;
4009 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010}
4011
4012PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4013{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004014 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004016 PyErr_BadArgument();
4017 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004019 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4020 PyUnicode_GET_SIZE(unicode));
4021
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004022 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023}
4024
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004025/* --- Unicode Internal Codec ------------------------------------------- */
4026
4027PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 Py_ssize_t size,
4029 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004030{
4031 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004032 Py_ssize_t startinpos;
4033 Py_ssize_t endinpos;
4034 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004035 PyUnicodeObject *v;
4036 Py_UNICODE *p;
4037 const char *end;
4038 const char *reason;
4039 PyObject *errorHandler = NULL;
4040 PyObject *exc = NULL;
4041
Neal Norwitzd43069c2006-01-08 01:12:10 +00004042#ifdef Py_UNICODE_WIDE
4043 Py_UNICODE unimax = PyUnicode_GetMax();
4044#endif
4045
Thomas Wouters89f507f2006-12-13 04:49:30 +00004046 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004047 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4048 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004050 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004051 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004052 p = PyUnicode_AS_UNICODE(v);
4053 end = s + size;
4054
4055 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004056 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004057 /* We have to sanity check the raw data, otherwise doom looms for
4058 some malformed UCS-4 data. */
4059 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004060#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004061 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004062#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004063 end-s < Py_UNICODE_SIZE
4064 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004066 startinpos = s - starts;
4067 if (end-s < Py_UNICODE_SIZE) {
4068 endinpos = end-starts;
4069 reason = "truncated input";
4070 }
4071 else {
4072 endinpos = s - starts + Py_UNICODE_SIZE;
4073 reason = "illegal code point (> 0x10FFFF)";
4074 }
4075 outpos = p - PyUnicode_AS_UNICODE(v);
4076 if (unicode_decode_call_errorhandler(
4077 errors, &errorHandler,
4078 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004079 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004080 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004081 goto onError;
4082 }
4083 }
4084 else {
4085 p++;
4086 s += Py_UNICODE_SIZE;
4087 }
4088 }
4089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004090 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004091 goto onError;
4092 Py_XDECREF(errorHandler);
4093 Py_XDECREF(exc);
4094 return (PyObject *)v;
4095
Benjamin Peterson29060642009-01-31 22:14:21 +00004096 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004097 Py_XDECREF(v);
4098 Py_XDECREF(errorHandler);
4099 Py_XDECREF(exc);
4100 return NULL;
4101}
4102
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103/* --- Latin-1 Codec ------------------------------------------------------ */
4104
4105PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 Py_ssize_t size,
4107 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108{
4109 PyUnicodeObject *v;
4110 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004111 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004112
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004114 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 Py_UNICODE r = *(unsigned char*)s;
4116 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004117 }
4118
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 v = _PyUnicode_New(size);
4120 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004125 e = s + size;
4126 /* Unrolling the copy makes it much faster by reducing the looping
4127 overhead. This is similar to what many memcpy() implementations do. */
4128 unrolled_end = e - 4;
4129 while (s < unrolled_end) {
4130 p[0] = (unsigned char) s[0];
4131 p[1] = (unsigned char) s[1];
4132 p[2] = (unsigned char) s[2];
4133 p[3] = (unsigned char) s[3];
4134 s += 4;
4135 p += 4;
4136 }
4137 while (s < e)
4138 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004140
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 Py_XDECREF(v);
4143 return NULL;
4144}
4145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146/* create or adjust a UnicodeEncodeError */
4147static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 const char *encoding,
4149 const Py_UNICODE *unicode, Py_ssize_t size,
4150 Py_ssize_t startpos, Py_ssize_t endpos,
4151 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 *exceptionObject = PyUnicodeEncodeError_Create(
4155 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 }
4157 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4159 goto onError;
4160 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4161 goto onError;
4162 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4163 goto onError;
4164 return;
4165 onError:
4166 Py_DECREF(*exceptionObject);
4167 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 }
4169}
4170
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171/* raises a UnicodeEncodeError */
4172static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 const char *encoding,
4174 const Py_UNICODE *unicode, Py_ssize_t size,
4175 Py_ssize_t startpos, Py_ssize_t endpos,
4176 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177{
4178 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182}
4183
4184/* error handling callback helper:
4185 build arguments, call the callback and check the arguments,
4186 put the result into newpos and return the replacement string, which
4187 has to be freed by the caller */
4188static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 PyObject **errorHandler,
4190 const char *encoding, const char *reason,
4191 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4192 Py_ssize_t startpos, Py_ssize_t endpos,
4193 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004195 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196
4197 PyObject *restuple;
4198 PyObject *resunicode;
4199
4200 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 }
4205
4206 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004209 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210
4211 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004214 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004216 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004217 Py_DECREF(restuple);
4218 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004220 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 &resunicode, newpos)) {
4222 Py_DECREF(restuple);
4223 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004225 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4226 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4227 Py_DECREF(restuple);
4228 return NULL;
4229 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004232 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4234 Py_DECREF(restuple);
4235 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004236 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237 Py_INCREF(resunicode);
4238 Py_DECREF(restuple);
4239 return resunicode;
4240}
4241
4242static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 Py_ssize_t size,
4244 const char *errors,
4245 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246{
4247 /* output object */
4248 PyObject *res;
4249 /* pointers to the beginning and end+1 of input */
4250 const Py_UNICODE *startp = p;
4251 const Py_UNICODE *endp = p + size;
4252 /* pointer to the beginning of the unencodable characters */
4253 /* const Py_UNICODE *badp = NULL; */
4254 /* pointer into the output */
4255 char *str;
4256 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004257 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004258 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4259 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 PyObject *errorHandler = NULL;
4261 PyObject *exc = NULL;
4262 /* the following variable is used for caching string comparisons
4263 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4264 int known_errorHandler = -1;
4265
4266 /* allocate enough for a simple encoding without
4267 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004268 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004269 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004270 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004272 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004273 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 ressize = size;
4275
4276 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278
Benjamin Peterson29060642009-01-31 22:14:21 +00004279 /* can we encode this? */
4280 if (c<limit) {
4281 /* no overflow check, because we know that the space is enough */
4282 *str++ = (char)c;
4283 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004284 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004285 else {
4286 Py_ssize_t unicodepos = p-startp;
4287 Py_ssize_t requiredsize;
4288 PyObject *repunicode;
4289 Py_ssize_t repsize;
4290 Py_ssize_t newpos;
4291 Py_ssize_t respos;
4292 Py_UNICODE *uni2;
4293 /* startpos for collecting unencodable chars */
4294 const Py_UNICODE *collstart = p;
4295 const Py_UNICODE *collend = p;
4296 /* find all unecodable characters */
4297 while ((collend < endp) && ((*collend)>=limit))
4298 ++collend;
4299 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4300 if (known_errorHandler==-1) {
4301 if ((errors==NULL) || (!strcmp(errors, "strict")))
4302 known_errorHandler = 1;
4303 else if (!strcmp(errors, "replace"))
4304 known_errorHandler = 2;
4305 else if (!strcmp(errors, "ignore"))
4306 known_errorHandler = 3;
4307 else if (!strcmp(errors, "xmlcharrefreplace"))
4308 known_errorHandler = 4;
4309 else
4310 known_errorHandler = 0;
4311 }
4312 switch (known_errorHandler) {
4313 case 1: /* strict */
4314 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4315 goto onError;
4316 case 2: /* replace */
4317 while (collstart++<collend)
4318 *str++ = '?'; /* fall through */
4319 case 3: /* ignore */
4320 p = collend;
4321 break;
4322 case 4: /* xmlcharrefreplace */
4323 respos = str - PyBytes_AS_STRING(res);
4324 /* determine replacement size (temporarily (mis)uses p) */
4325 for (p = collstart, repsize = 0; p < collend; ++p) {
4326 if (*p<10)
4327 repsize += 2+1+1;
4328 else if (*p<100)
4329 repsize += 2+2+1;
4330 else if (*p<1000)
4331 repsize += 2+3+1;
4332 else if (*p<10000)
4333 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004334#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004335 else
4336 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004337#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 else if (*p<100000)
4339 repsize += 2+5+1;
4340 else if (*p<1000000)
4341 repsize += 2+6+1;
4342 else
4343 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004344#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004345 }
4346 requiredsize = respos+repsize+(endp-collend);
4347 if (requiredsize > ressize) {
4348 if (requiredsize<2*ressize)
4349 requiredsize = 2*ressize;
4350 if (_PyBytes_Resize(&res, requiredsize))
4351 goto onError;
4352 str = PyBytes_AS_STRING(res) + respos;
4353 ressize = requiredsize;
4354 }
4355 /* generate replacement (temporarily (mis)uses p) */
4356 for (p = collstart; p < collend; ++p) {
4357 str += sprintf(str, "&#%d;", (int)*p);
4358 }
4359 p = collend;
4360 break;
4361 default:
4362 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4363 encoding, reason, startp, size, &exc,
4364 collstart-startp, collend-startp, &newpos);
4365 if (repunicode == NULL)
4366 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004367 if (PyBytes_Check(repunicode)) {
4368 /* Directly copy bytes result to output. */
4369 repsize = PyBytes_Size(repunicode);
4370 if (repsize > 1) {
4371 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004372 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004373 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4374 Py_DECREF(repunicode);
4375 goto onError;
4376 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004377 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004378 ressize += repsize-1;
4379 }
4380 memcpy(str, PyBytes_AsString(repunicode), repsize);
4381 str += repsize;
4382 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004383 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004384 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004385 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 /* need more space? (at least enough for what we
4387 have+the replacement+the rest of the string, so
4388 we won't have to check space for encodable characters) */
4389 respos = str - PyBytes_AS_STRING(res);
4390 repsize = PyUnicode_GET_SIZE(repunicode);
4391 requiredsize = respos+repsize+(endp-collend);
4392 if (requiredsize > ressize) {
4393 if (requiredsize<2*ressize)
4394 requiredsize = 2*ressize;
4395 if (_PyBytes_Resize(&res, requiredsize)) {
4396 Py_DECREF(repunicode);
4397 goto onError;
4398 }
4399 str = PyBytes_AS_STRING(res) + respos;
4400 ressize = requiredsize;
4401 }
4402 /* check if there is anything unencodable in the replacement
4403 and copy it to the output */
4404 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4405 c = *uni2;
4406 if (c >= limit) {
4407 raise_encode_exception(&exc, encoding, startp, size,
4408 unicodepos, unicodepos+1, reason);
4409 Py_DECREF(repunicode);
4410 goto onError;
4411 }
4412 *str = (char)c;
4413 }
4414 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004415 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004416 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004417 }
4418 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004419 /* Resize if we allocated too much */
4420 size = str - PyBytes_AS_STRING(res);
4421 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004422 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004423 if (_PyBytes_Resize(&res, size) < 0)
4424 goto onError;
4425 }
4426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 Py_XDECREF(errorHandler);
4428 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004429 return res;
4430
4431 onError:
4432 Py_XDECREF(res);
4433 Py_XDECREF(errorHandler);
4434 Py_XDECREF(exc);
4435 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436}
4437
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 Py_ssize_t size,
4440 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443}
4444
4445PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4446{
4447 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 PyErr_BadArgument();
4449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 }
4451 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 PyUnicode_GET_SIZE(unicode),
4453 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454}
4455
4456/* --- 7-bit ASCII Codec -------------------------------------------------- */
4457
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 Py_ssize_t size,
4460 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 PyUnicodeObject *v;
4464 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004465 Py_ssize_t startinpos;
4466 Py_ssize_t endinpos;
4467 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 const char *e;
4469 PyObject *errorHandler = NULL;
4470 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004471
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004473 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 Py_UNICODE r = *(unsigned char*)s;
4475 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004476 }
Tim Petersced69f82003-09-16 20:30:58 +00004477
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 v = _PyUnicode_New(size);
4479 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004482 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004484 e = s + size;
4485 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 register unsigned char c = (unsigned char)*s;
4487 if (c < 128) {
4488 *p++ = c;
4489 ++s;
4490 }
4491 else {
4492 startinpos = s-starts;
4493 endinpos = startinpos + 1;
4494 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4495 if (unicode_decode_call_errorhandler(
4496 errors, &errorHandler,
4497 "ascii", "ordinal not in range(128)",
4498 &starts, &e, &startinpos, &endinpos, &exc, &s,
4499 &v, &outpos, &p))
4500 goto onError;
4501 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004503 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4505 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 Py_XDECREF(errorHandler);
4507 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004509
Benjamin Peterson29060642009-01-31 22:14:21 +00004510 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 Py_XDECREF(errorHandler);
4513 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514 return NULL;
4515}
4516
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 Py_ssize_t size,
4519 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522}
4523
4524PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4525{
4526 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 PyErr_BadArgument();
4528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529 }
4530 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 PyUnicode_GET_SIZE(unicode),
4532 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533}
4534
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004535#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004536
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004537/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004538
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004539#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004540#define NEED_RETRY
4541#endif
4542
4543/* XXX This code is limited to "true" double-byte encodings, as
4544 a) it assumes an incomplete character consists of a single byte, and
4545 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004547
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back (it returns the same position), when the
   previous position is not itself a lead byte, or when the previous
   character is two bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
4558
4559/*
4560 * Decode MBCS string into unicode object. If 'final' is set, converts
4561 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4562 */
4563static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 const char *s, /* MBCS string */
4565 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004566 int final,
4567 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004568{
4569 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004570 Py_ssize_t n;
4571 DWORD usize;
4572 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004573
4574 assert(size >= 0);
4575
Victor Stinner554f3f02010-06-16 23:33:54 +00004576 /* check and handle 'errors' arg */
4577 if (errors==NULL || strcmp(errors, "strict")==0)
4578 flags = MB_ERR_INVALID_CHARS;
4579 else if (strcmp(errors, "ignore")==0)
4580 flags = 0;
4581 else {
4582 PyErr_Format(PyExc_ValueError,
4583 "mbcs encoding does not support errors='%s'",
4584 errors);
4585 return -1;
4586 }
4587
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004588 /* Skip trailing lead-byte unless 'final' is set */
4589 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004591
4592 /* First get the size of the result */
4593 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004594 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4595 if (usize==0)
4596 goto mbcs_decode_error;
4597 } else
4598 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004599
4600 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 /* Create unicode object */
4602 *v = _PyUnicode_New(usize);
4603 if (*v == NULL)
4604 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004605 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004606 }
4607 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004608 /* Extend unicode object */
4609 n = PyUnicode_GET_SIZE(*v);
4610 if (_PyUnicode_Resize(v, n + usize) < 0)
4611 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004612 }
4613
4614 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004615 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004617 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4618 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004619 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004620 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004621 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004622
4623mbcs_decode_error:
4624 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4625 we raise a UnicodeDecodeError - else it is a 'generic'
4626 windows error
4627 */
4628 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4629 /* Ideally, we should get reason from FormatMessage - this
4630 is the Windows 2000 English version of the message
4631 */
4632 PyObject *exc = NULL;
4633 const char *reason = "No mapping for the Unicode character exists "
4634 "in the target multi-byte code page.";
4635 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4636 if (exc != NULL) {
4637 PyCodec_StrictErrors(exc);
4638 Py_DECREF(exc);
4639 }
4640 } else {
4641 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4642 }
4643 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004644}
4645
4646PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004647 Py_ssize_t size,
4648 const char *errors,
4649 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004650{
4651 PyUnicodeObject *v = NULL;
4652 int done;
4653
4654 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004656
4657#ifdef NEED_RETRY
4658 retry:
4659 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004660 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004661 else
4662#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004663 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004664
4665 if (done < 0) {
4666 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004667 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004668 }
4669
4670 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004672
4673#ifdef NEED_RETRY
4674 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 s += done;
4676 size -= done;
4677 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004678 }
4679#endif
4680
4681 return (PyObject *)v;
4682}
4683
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004684PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 Py_ssize_t size,
4686 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004687{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004688 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4689}
4690
4691/*
4692 * Convert unicode into string object (MBCS).
4693 * Returns 0 if succeed, -1 otherwise.
4694 */
4695static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004696 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004697 int size, /* size of unicode */
4698 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004699{
Victor Stinner554f3f02010-06-16 23:33:54 +00004700 BOOL usedDefaultChar = FALSE;
4701 BOOL *pusedDefaultChar;
4702 int mbcssize;
4703 Py_ssize_t n;
4704 PyObject *exc = NULL;
4705 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004706
4707 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004708
Victor Stinner554f3f02010-06-16 23:33:54 +00004709 /* check and handle 'errors' arg */
4710 if (errors==NULL || strcmp(errors, "strict")==0) {
4711 flags = WC_NO_BEST_FIT_CHARS;
4712 pusedDefaultChar = &usedDefaultChar;
4713 } else if (strcmp(errors, "replace")==0) {
4714 flags = 0;
4715 pusedDefaultChar = NULL;
4716 } else {
4717 PyErr_Format(PyExc_ValueError,
4718 "mbcs encoding does not support errors='%s'",
4719 errors);
4720 return -1;
4721 }
4722
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004723 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004724 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004725 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4726 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 if (mbcssize == 0) {
4728 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4729 return -1;
4730 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004731 /* If we used a default char, then we failed! */
4732 if (pusedDefaultChar && *pusedDefaultChar)
4733 goto mbcs_encode_error;
4734 } else {
4735 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004736 }
4737
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004738 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 /* Create string object */
4740 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4741 if (*repr == NULL)
4742 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004743 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004744 }
4745 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004746 /* Extend string object */
4747 n = PyBytes_Size(*repr);
4748 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4749 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004750 }
4751
4752 /* Do the conversion */
4753 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004754 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004755 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4756 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004757 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4758 return -1;
4759 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004760 if (pusedDefaultChar && *pusedDefaultChar)
4761 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004762 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004763 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004764
4765mbcs_encode_error:
4766 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4767 Py_XDECREF(exc);
4768 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004769}
4770
4771PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004772 Py_ssize_t size,
4773 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004774{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004775 PyObject *repr = NULL;
4776 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004777
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004778#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004780 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004781 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004782 else
4783#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004784 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004785
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004786 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 Py_XDECREF(repr);
4788 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004789 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004790
4791#ifdef NEED_RETRY
4792 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004793 p += INT_MAX;
4794 size -= INT_MAX;
4795 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004796 }
4797#endif
4798
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004799 return repr;
4800}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004801
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004802PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4803{
4804 if (!PyUnicode_Check(unicode)) {
4805 PyErr_BadArgument();
4806 return NULL;
4807 }
4808 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 PyUnicode_GET_SIZE(unicode),
4810 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004811}
4812
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004813#undef NEED_RETRY
4814
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004815#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817/* --- Character Mapping Codec -------------------------------------------- */
4818
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004820 Py_ssize_t size,
4821 PyObject *mapping,
4822 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004825 Py_ssize_t startinpos;
4826 Py_ssize_t endinpos;
4827 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 PyUnicodeObject *v;
4830 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004831 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 PyObject *errorHandler = NULL;
4833 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004834 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004835 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004836
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 /* Default to Latin-1 */
4838 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840
4841 v = _PyUnicode_New(size);
4842 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004847 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004848 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 mapstring = PyUnicode_AS_UNICODE(mapping);
4850 maplen = PyUnicode_GET_SIZE(mapping);
4851 while (s < e) {
4852 unsigned char ch = *s;
4853 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 if (ch < maplen)
4856 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 if (x == 0xfffe) {
4859 /* undefined mapping */
4860 outpos = p-PyUnicode_AS_UNICODE(v);
4861 startinpos = s-starts;
4862 endinpos = startinpos+1;
4863 if (unicode_decode_call_errorhandler(
4864 errors, &errorHandler,
4865 "charmap", "character maps to <undefined>",
4866 &starts, &e, &startinpos, &endinpos, &exc, &s,
4867 &v, &outpos, &p)) {
4868 goto onError;
4869 }
4870 continue;
4871 }
4872 *p++ = x;
4873 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004874 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004875 }
4876 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 while (s < e) {
4878 unsigned char ch = *s;
4879 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004880
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4882 w = PyLong_FromLong((long)ch);
4883 if (w == NULL)
4884 goto onError;
4885 x = PyObject_GetItem(mapping, w);
4886 Py_DECREF(w);
4887 if (x == NULL) {
4888 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4889 /* No mapping found means: mapping is undefined. */
4890 PyErr_Clear();
4891 x = Py_None;
4892 Py_INCREF(x);
4893 } else
4894 goto onError;
4895 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004896
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 /* Apply mapping */
4898 if (PyLong_Check(x)) {
4899 long value = PyLong_AS_LONG(x);
4900 if (value < 0 || value > 65535) {
4901 PyErr_SetString(PyExc_TypeError,
4902 "character mapping must be in range(65536)");
4903 Py_DECREF(x);
4904 goto onError;
4905 }
4906 *p++ = (Py_UNICODE)value;
4907 }
4908 else if (x == Py_None) {
4909 /* undefined mapping */
4910 outpos = p-PyUnicode_AS_UNICODE(v);
4911 startinpos = s-starts;
4912 endinpos = startinpos+1;
4913 if (unicode_decode_call_errorhandler(
4914 errors, &errorHandler,
4915 "charmap", "character maps to <undefined>",
4916 &starts, &e, &startinpos, &endinpos, &exc, &s,
4917 &v, &outpos, &p)) {
4918 Py_DECREF(x);
4919 goto onError;
4920 }
4921 Py_DECREF(x);
4922 continue;
4923 }
4924 else if (PyUnicode_Check(x)) {
4925 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004926
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 if (targetsize == 1)
4928 /* 1-1 mapping */
4929 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004930
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 else if (targetsize > 1) {
4932 /* 1-n mapping */
4933 if (targetsize > extrachars) {
4934 /* resize first */
4935 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4936 Py_ssize_t needed = (targetsize - extrachars) + \
4937 (targetsize << 2);
4938 extrachars += needed;
4939 /* XXX overflow detection missing */
4940 if (_PyUnicode_Resize(&v,
4941 PyUnicode_GET_SIZE(v) + needed) < 0) {
4942 Py_DECREF(x);
4943 goto onError;
4944 }
4945 p = PyUnicode_AS_UNICODE(v) + oldpos;
4946 }
4947 Py_UNICODE_COPY(p,
4948 PyUnicode_AS_UNICODE(x),
4949 targetsize);
4950 p += targetsize;
4951 extrachars -= targetsize;
4952 }
4953 /* 1-0 mapping: skip the character */
4954 }
4955 else {
4956 /* wrong return value */
4957 PyErr_SetString(PyExc_TypeError,
4958 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004959 Py_DECREF(x);
4960 goto onError;
4961 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 Py_DECREF(x);
4963 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 }
4966 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4968 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 Py_XDECREF(errorHandler);
4970 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004972
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004974 Py_XDECREF(errorHandler);
4975 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976 Py_XDECREF(v);
4977 return NULL;
4978}
4979
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004980/* Charmap encoding: the lookup table */
4981
4982struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 PyObject_HEAD
4984 unsigned char level1[32];
4985 int count2, count3;
4986 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004987};
4988
4989static PyObject*
4990encoding_map_size(PyObject *obj, PyObject* args)
4991{
4992 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004993 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004995}
4996
4997static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004998 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 PyDoc_STR("Return the size (in bytes) of this object") },
5000 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005001};
5002
5003static void
5004encoding_map_dealloc(PyObject* o)
5005{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005006 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005007}
5008
5009static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005010 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 "EncodingMap", /*tp_name*/
5012 sizeof(struct encoding_map), /*tp_basicsize*/
5013 0, /*tp_itemsize*/
5014 /* methods */
5015 encoding_map_dealloc, /*tp_dealloc*/
5016 0, /*tp_print*/
5017 0, /*tp_getattr*/
5018 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005019 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 0, /*tp_repr*/
5021 0, /*tp_as_number*/
5022 0, /*tp_as_sequence*/
5023 0, /*tp_as_mapping*/
5024 0, /*tp_hash*/
5025 0, /*tp_call*/
5026 0, /*tp_str*/
5027 0, /*tp_getattro*/
5028 0, /*tp_setattro*/
5029 0, /*tp_as_buffer*/
5030 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5031 0, /*tp_doc*/
5032 0, /*tp_traverse*/
5033 0, /*tp_clear*/
5034 0, /*tp_richcompare*/
5035 0, /*tp_weaklistoffset*/
5036 0, /*tp_iter*/
5037 0, /*tp_iternext*/
5038 encoding_map_methods, /*tp_methods*/
5039 0, /*tp_members*/
5040 0, /*tp_getset*/
5041 0, /*tp_base*/
5042 0, /*tp_dict*/
5043 0, /*tp_descr_get*/
5044 0, /*tp_descr_set*/
5045 0, /*tp_dictoffset*/
5046 0, /*tp_init*/
5047 0, /*tp_alloc*/
5048 0, /*tp_new*/
5049 0, /*tp_free*/
5050 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005051};
5052
5053PyObject*
5054PyUnicode_BuildEncodingMap(PyObject* string)
5055{
5056 Py_UNICODE *decode;
5057 PyObject *result;
5058 struct encoding_map *mresult;
5059 int i;
5060 int need_dict = 0;
5061 unsigned char level1[32];
5062 unsigned char level2[512];
5063 unsigned char *mlevel1, *mlevel2, *mlevel3;
5064 int count2 = 0, count3 = 0;
5065
5066 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5067 PyErr_BadArgument();
5068 return NULL;
5069 }
5070 decode = PyUnicode_AS_UNICODE(string);
5071 memset(level1, 0xFF, sizeof level1);
5072 memset(level2, 0xFF, sizeof level2);
5073
5074 /* If there isn't a one-to-one mapping of NULL to \0,
5075 or if there are non-BMP characters, we need to use
5076 a mapping dictionary. */
5077 if (decode[0] != 0)
5078 need_dict = 1;
5079 for (i = 1; i < 256; i++) {
5080 int l1, l2;
5081 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005082#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005083 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005084#endif
5085 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005086 need_dict = 1;
5087 break;
5088 }
5089 if (decode[i] == 0xFFFE)
5090 /* unmapped character */
5091 continue;
5092 l1 = decode[i] >> 11;
5093 l2 = decode[i] >> 7;
5094 if (level1[l1] == 0xFF)
5095 level1[l1] = count2++;
5096 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005097 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005098 }
5099
5100 if (count2 >= 0xFF || count3 >= 0xFF)
5101 need_dict = 1;
5102
5103 if (need_dict) {
5104 PyObject *result = PyDict_New();
5105 PyObject *key, *value;
5106 if (!result)
5107 return NULL;
5108 for (i = 0; i < 256; i++) {
5109 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005110 key = PyLong_FromLong(decode[i]);
5111 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005112 if (!key || !value)
5113 goto failed1;
5114 if (PyDict_SetItem(result, key, value) == -1)
5115 goto failed1;
5116 Py_DECREF(key);
5117 Py_DECREF(value);
5118 }
5119 return result;
5120 failed1:
5121 Py_XDECREF(key);
5122 Py_XDECREF(value);
5123 Py_DECREF(result);
5124 return NULL;
5125 }
5126
5127 /* Create a three-level trie */
5128 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5129 16*count2 + 128*count3 - 1);
5130 if (!result)
5131 return PyErr_NoMemory();
5132 PyObject_Init(result, &EncodingMapType);
5133 mresult = (struct encoding_map*)result;
5134 mresult->count2 = count2;
5135 mresult->count3 = count3;
5136 mlevel1 = mresult->level1;
5137 mlevel2 = mresult->level23;
5138 mlevel3 = mresult->level23 + 16*count2;
5139 memcpy(mlevel1, level1, 32);
5140 memset(mlevel2, 0xFF, 16*count2);
5141 memset(mlevel3, 0, 128*count3);
5142 count3 = 0;
5143 for (i = 1; i < 256; i++) {
5144 int o1, o2, o3, i2, i3;
5145 if (decode[i] == 0xFFFE)
5146 /* unmapped character */
5147 continue;
5148 o1 = decode[i]>>11;
5149 o2 = (decode[i]>>7) & 0xF;
5150 i2 = 16*mlevel1[o1] + o2;
5151 if (mlevel2[i2] == 0xFF)
5152 mlevel2[i2] = count3++;
5153 o3 = decode[i] & 0x7F;
5154 i3 = 128*mlevel2[i2] + o3;
5155 mlevel3[i3] = i;
5156 }
5157 return result;
5158}
5159
5160static int
5161encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5162{
5163 struct encoding_map *map = (struct encoding_map*)mapping;
5164 int l1 = c>>11;
5165 int l2 = (c>>7) & 0xF;
5166 int l3 = c & 0x7F;
5167 int i;
5168
5169#ifdef Py_UNICODE_WIDE
5170 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005172 }
5173#endif
5174 if (c == 0)
5175 return 0;
5176 /* level 1*/
5177 i = map->level1[l1];
5178 if (i == 0xFF) {
5179 return -1;
5180 }
5181 /* level 2*/
5182 i = map->level23[16*i+l2];
5183 if (i == 0xFF) {
5184 return -1;
5185 }
5186 /* level 3 */
5187 i = map->level23[16*map->count2 + 128*i + l3];
5188 if (i == 0) {
5189 return -1;
5190 }
5191 return i;
5192}
5193
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005194/* Lookup the character ch in the mapping. If the character
5195 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005196 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005197static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198{
Christian Heimes217cfd12007-12-02 14:31:20 +00005199 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005200 PyObject *x;
5201
5202 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005204 x = PyObject_GetItem(mapping, w);
5205 Py_DECREF(w);
5206 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5208 /* No mapping found means: mapping is undefined. */
5209 PyErr_Clear();
5210 x = Py_None;
5211 Py_INCREF(x);
5212 return x;
5213 } else
5214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005216 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005217 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005218 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005219 long value = PyLong_AS_LONG(x);
5220 if (value < 0 || value > 255) {
5221 PyErr_SetString(PyExc_TypeError,
5222 "character mapping must be in range(256)");
5223 Py_DECREF(x);
5224 return NULL;
5225 }
5226 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005228 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 /* wrong return value */
5232 PyErr_Format(PyExc_TypeError,
5233 "character mapping must return integer, bytes or None, not %.400s",
5234 x->ob_type->tp_name);
5235 Py_DECREF(x);
5236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 }
5238}
5239
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005240static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005241charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005242{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005243 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5244 /* exponentially overallocate to minimize reallocations */
5245 if (requiredsize < 2*outsize)
5246 requiredsize = 2*outsize;
5247 if (_PyBytes_Resize(outobj, requiredsize))
5248 return -1;
5249 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005250}
5251
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005255/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005256 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005257 space is available. Return a new reference to the object that
5258 was put in the output buffer, or Py_None, if the mapping was undefined
5259 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005260 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005261static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005262charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005265 PyObject *rep;
5266 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005267 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005268
Christian Heimes90aa7642007-12-19 02:45:37 +00005269 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005270 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005272 if (res == -1)
5273 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 if (outsize<requiredsize)
5275 if (charmapencode_resize(outobj, outpos, requiredsize))
5276 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005277 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 outstart[(*outpos)++] = (char)res;
5279 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005280 }
5281
5282 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005285 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 Py_DECREF(rep);
5287 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005288 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 if (PyLong_Check(rep)) {
5290 Py_ssize_t requiredsize = *outpos+1;
5291 if (outsize<requiredsize)
5292 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5293 Py_DECREF(rep);
5294 return enc_EXCEPTION;
5295 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005296 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005298 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 else {
5300 const char *repchars = PyBytes_AS_STRING(rep);
5301 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5302 Py_ssize_t requiredsize = *outpos+repsize;
5303 if (outsize<requiredsize)
5304 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5305 Py_DECREF(rep);
5306 return enc_EXCEPTION;
5307 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005308 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 memcpy(outstart + *outpos, repchars, repsize);
5310 *outpos += repsize;
5311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005312 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005313 Py_DECREF(rep);
5314 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005315}
5316
5317/* handle an error in PyUnicode_EncodeCharmap
5318 Return 0 on success, -1 on error */
5319static
5320int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005321 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005322 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005323 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005324 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005325{
5326 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005327 Py_ssize_t repsize;
5328 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005329 Py_UNICODE *uni2;
5330 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005331 Py_ssize_t collstartpos = *inpos;
5332 Py_ssize_t collendpos = *inpos+1;
5333 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005334 char *encoding = "charmap";
5335 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005336 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338 /* find all unencodable characters */
5339 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005340 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005341 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 int res = encoding_map_lookup(p[collendpos], mapping);
5343 if (res != -1)
5344 break;
5345 ++collendpos;
5346 continue;
5347 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005348
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 rep = charmapencode_lookup(p[collendpos], mapping);
5350 if (rep==NULL)
5351 return -1;
5352 else if (rep!=Py_None) {
5353 Py_DECREF(rep);
5354 break;
5355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005356 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005358 }
5359 /* cache callback name lookup
5360 * (if not done yet, i.e. it's the first error) */
5361 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 if ((errors==NULL) || (!strcmp(errors, "strict")))
5363 *known_errorHandler = 1;
5364 else if (!strcmp(errors, "replace"))
5365 *known_errorHandler = 2;
5366 else if (!strcmp(errors, "ignore"))
5367 *known_errorHandler = 3;
5368 else if (!strcmp(errors, "xmlcharrefreplace"))
5369 *known_errorHandler = 4;
5370 else
5371 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 }
5373 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005374 case 1: /* strict */
5375 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5376 return -1;
5377 case 2: /* replace */
5378 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 x = charmapencode_output('?', mapping, res, respos);
5380 if (x==enc_EXCEPTION) {
5381 return -1;
5382 }
5383 else if (x==enc_FAILED) {
5384 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5385 return -1;
5386 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005387 }
5388 /* fall through */
5389 case 3: /* ignore */
5390 *inpos = collendpos;
5391 break;
5392 case 4: /* xmlcharrefreplace */
5393 /* generate replacement (temporarily (mis)uses p) */
5394 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 char buffer[2+29+1+1];
5396 char *cp;
5397 sprintf(buffer, "&#%d;", (int)p[collpos]);
5398 for (cp = buffer; *cp; ++cp) {
5399 x = charmapencode_output(*cp, mapping, res, respos);
5400 if (x==enc_EXCEPTION)
5401 return -1;
5402 else if (x==enc_FAILED) {
5403 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5404 return -1;
5405 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005406 }
5407 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005408 *inpos = collendpos;
5409 break;
5410 default:
5411 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 encoding, reason, p, size, exceptionObject,
5413 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005414 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005416 if (PyBytes_Check(repunicode)) {
5417 /* Directly copy bytes result to output. */
5418 Py_ssize_t outsize = PyBytes_Size(*res);
5419 Py_ssize_t requiredsize;
5420 repsize = PyBytes_Size(repunicode);
5421 requiredsize = *respos + repsize;
5422 if (requiredsize > outsize)
5423 /* Make room for all additional bytes. */
5424 if (charmapencode_resize(res, respos, requiredsize)) {
5425 Py_DECREF(repunicode);
5426 return -1;
5427 }
5428 memcpy(PyBytes_AsString(*res) + *respos,
5429 PyBytes_AsString(repunicode), repsize);
5430 *respos += repsize;
5431 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005432 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005433 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005434 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005435 /* generate replacement */
5436 repsize = PyUnicode_GET_SIZE(repunicode);
5437 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 x = charmapencode_output(*uni2, mapping, res, respos);
5439 if (x==enc_EXCEPTION) {
5440 return -1;
5441 }
5442 else if (x==enc_FAILED) {
5443 Py_DECREF(repunicode);
5444 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5445 return -1;
5446 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005447 }
5448 *inpos = newpos;
5449 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005450 }
5451 return 0;
5452}
5453
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 Py_ssize_t size,
5456 PyObject *mapping,
5457 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 /* output object */
5460 PyObject *res = NULL;
5461 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005462 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005463 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005464 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005465 PyObject *errorHandler = NULL;
5466 PyObject *exc = NULL;
5467 /* the following variable is used for caching string comparisons
5468 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5469 * 3=ignore, 4=xmlcharrefreplace */
5470 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471
5472 /* Default to Latin-1 */
5473 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 /* allocate enough for a simple encoding without
5477 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005478 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479 if (res == NULL)
5480 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005481 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005484 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 /* try to encode it */
5486 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5487 if (x==enc_EXCEPTION) /* error */
5488 goto onError;
5489 if (x==enc_FAILED) { /* unencodable character */
5490 if (charmap_encoding_error(p, size, &inpos, mapping,
5491 &exc,
5492 &known_errorHandler, &errorHandler, errors,
5493 &res, &respos)) {
5494 goto onError;
5495 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005496 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 else
5498 /* done with this character => adjust input position */
5499 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005503 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005504 if (_PyBytes_Resize(&res, respos) < 0)
5505 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005507 Py_XDECREF(exc);
5508 Py_XDECREF(errorHandler);
5509 return res;
5510
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 Py_XDECREF(res);
5513 Py_XDECREF(exc);
5514 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 return NULL;
5516}
5517
5518PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520{
5521 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 PyErr_BadArgument();
5523 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 }
5525 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 PyUnicode_GET_SIZE(unicode),
5527 mapping,
5528 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529}
5530
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005531/* create or adjust a UnicodeTranslateError */
5532static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 const Py_UNICODE *unicode, Py_ssize_t size,
5534 Py_ssize_t startpos, Py_ssize_t endpos,
5535 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005538 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 }
5541 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5543 goto onError;
5544 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5545 goto onError;
5546 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5547 goto onError;
5548 return;
5549 onError:
5550 Py_DECREF(*exceptionObject);
5551 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 }
5553}
5554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005555/* raises a UnicodeTranslateError */
5556static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 const Py_UNICODE *unicode, Py_ssize_t size,
5558 Py_ssize_t startpos, Py_ssize_t endpos,
5559 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005560{
5561 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005565}
5566
5567/* error handling callback helper:
5568 build arguments, call the callback and check the arguments,
5569 put the result into newpos and return the replacement string, which
5570 has to be freed by the caller */
5571static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 PyObject **errorHandler,
5573 const char *reason,
5574 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5575 Py_ssize_t startpos, Py_ssize_t endpos,
5576 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005578 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005579
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005580 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005581 PyObject *restuple;
5582 PyObject *resunicode;
5583
5584 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005586 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005588 }
5589
5590 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005592 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594
5595 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005599 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005600 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 Py_DECREF(restuple);
5602 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 }
5604 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 &resunicode, &i_newpos)) {
5606 Py_DECREF(restuple);
5607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005608 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005609 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005611 else
5612 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005613 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005614 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5615 Py_DECREF(restuple);
5616 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005617 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 Py_INCREF(resunicode);
5619 Py_DECREF(restuple);
5620 return resunicode;
5621}
5622
5623/* Lookup the character ch in the mapping and put the result in result,
5624 which must be decrefed by the caller.
5625 Return 0 on success, -1 on error */
5626static
5627int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5628{
Christian Heimes217cfd12007-12-02 14:31:20 +00005629 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 PyObject *x;
5631
5632 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 x = PyObject_GetItem(mapping, w);
5635 Py_DECREF(w);
5636 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5638 /* No mapping found means: use 1:1 mapping. */
5639 PyErr_Clear();
5640 *result = NULL;
5641 return 0;
5642 } else
5643 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005644 }
5645 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 *result = x;
5647 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005649 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 long value = PyLong_AS_LONG(x);
5651 long max = PyUnicode_GetMax();
5652 if (value < 0 || value > max) {
5653 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005654 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 Py_DECREF(x);
5656 return -1;
5657 }
5658 *result = x;
5659 return 0;
5660 }
5661 else if (PyUnicode_Check(x)) {
5662 *result = x;
5663 return 0;
5664 }
5665 else {
5666 /* wrong return value */
5667 PyErr_SetString(PyExc_TypeError,
5668 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005669 Py_DECREF(x);
5670 return -1;
5671 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672}
5673/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 if not reallocate and adjust various state variables.
5675 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676static
Walter Dörwald4894c302003-10-24 14:25:28 +00005677int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005680 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005681 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 /* remember old output position */
5683 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5684 /* exponentially overallocate to minimize reallocations */
5685 if (requiredsize < 2 * oldsize)
5686 requiredsize = 2 * oldsize;
5687 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5688 return -1;
5689 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005690 }
5691 return 0;
5692}
5693/* lookup the character, put the result in the output string and adjust
5694 various state variables. Return a new reference to the object that
5695 was put in the output buffer in *result, or Py_None, if the mapping was
5696 undefined (in which case no character was written).
5697 The called must decref result.
5698 Return 0 on success, -1 on error. */
5699static
Walter Dörwald4894c302003-10-24 14:25:28 +00005700int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5702 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703{
Walter Dörwald4894c302003-10-24 14:25:28 +00005704 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 /* not found => default to 1:1 mapping */
5708 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 }
5710 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005712 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 /* no overflow check, because we know that the space is enough */
5714 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 }
5716 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5718 if (repsize==1) {
5719 /* no overflow check, because we know that the space is enough */
5720 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5721 }
5722 else if (repsize!=0) {
5723 /* more than one character */
5724 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5725 (insize - (curinp-startinp)) +
5726 repsize - 1;
5727 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5728 return -1;
5729 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5730 *outp += repsize;
5731 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 }
5733 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 return 0;
5736}
5737
5738PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 Py_ssize_t size,
5740 PyObject *mapping,
5741 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743 /* output object */
5744 PyObject *res = NULL;
5745 /* pointers to the beginning and end+1 of input */
5746 const Py_UNICODE *startp = p;
5747 const Py_UNICODE *endp = p + size;
5748 /* pointer into the output */
5749 Py_UNICODE *str;
5750 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005751 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 char *reason = "character maps to <undefined>";
5753 PyObject *errorHandler = NULL;
5754 PyObject *exc = NULL;
5755 /* the following variable is used for caching string comparisons
5756 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5757 * 3=ignore, 4=xmlcharrefreplace */
5758 int known_errorHandler = -1;
5759
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 PyErr_BadArgument();
5762 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764
5765 /* allocate enough for a simple 1:1 translation without
5766 replacements, if we need more, we'll resize */
5767 res = PyUnicode_FromUnicode(NULL, size);
5768 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 /* try to encode it */
5776 PyObject *x = NULL;
5777 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5778 Py_XDECREF(x);
5779 goto onError;
5780 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005781 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 if (x!=Py_None) /* it worked => adjust input pointer */
5783 ++p;
5784 else { /* untranslatable character */
5785 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5786 Py_ssize_t repsize;
5787 Py_ssize_t newpos;
5788 Py_UNICODE *uni2;
5789 /* startpos for collecting untranslatable chars */
5790 const Py_UNICODE *collstart = p;
5791 const Py_UNICODE *collend = p+1;
5792 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 /* find all untranslatable characters */
5795 while (collend < endp) {
5796 if (charmaptranslate_lookup(*collend, mapping, &x))
5797 goto onError;
5798 Py_XDECREF(x);
5799 if (x!=Py_None)
5800 break;
5801 ++collend;
5802 }
5803 /* cache callback name lookup
5804 * (if not done yet, i.e. it's the first error) */
5805 if (known_errorHandler==-1) {
5806 if ((errors==NULL) || (!strcmp(errors, "strict")))
5807 known_errorHandler = 1;
5808 else if (!strcmp(errors, "replace"))
5809 known_errorHandler = 2;
5810 else if (!strcmp(errors, "ignore"))
5811 known_errorHandler = 3;
5812 else if (!strcmp(errors, "xmlcharrefreplace"))
5813 known_errorHandler = 4;
5814 else
5815 known_errorHandler = 0;
5816 }
5817 switch (known_errorHandler) {
5818 case 1: /* strict */
5819 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005820 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 case 2: /* replace */
5822 /* No need to check for space, this is a 1:1 replacement */
5823 for (coll = collstart; coll<collend; ++coll)
5824 *str++ = '?';
5825 /* fall through */
5826 case 3: /* ignore */
5827 p = collend;
5828 break;
5829 case 4: /* xmlcharrefreplace */
5830 /* generate replacement (temporarily (mis)uses p) */
5831 for (p = collstart; p < collend; ++p) {
5832 char buffer[2+29+1+1];
5833 char *cp;
5834 sprintf(buffer, "&#%d;", (int)*p);
5835 if (charmaptranslate_makespace(&res, &str,
5836 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5837 goto onError;
5838 for (cp = buffer; *cp; ++cp)
5839 *str++ = *cp;
5840 }
5841 p = collend;
5842 break;
5843 default:
5844 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5845 reason, startp, size, &exc,
5846 collstart-startp, collend-startp, &newpos);
5847 if (repunicode == NULL)
5848 goto onError;
5849 /* generate replacement */
5850 repsize = PyUnicode_GET_SIZE(repunicode);
5851 if (charmaptranslate_makespace(&res, &str,
5852 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5853 Py_DECREF(repunicode);
5854 goto onError;
5855 }
5856 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5857 *str++ = *uni2;
5858 p = startp + newpos;
5859 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005860 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005861 }
5862 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 /* Resize if we allocated to much */
5864 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005865 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 if (PyUnicode_Resize(&res, respos) < 0)
5867 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868 }
5869 Py_XDECREF(exc);
5870 Py_XDECREF(errorHandler);
5871 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 Py_XDECREF(res);
5875 Py_XDECREF(exc);
5876 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 return NULL;
5878}
5879
5880PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 PyObject *mapping,
5882 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883{
5884 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005885
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 str = PyUnicode_FromObject(str);
5887 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005890 PyUnicode_GET_SIZE(str),
5891 mapping,
5892 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 Py_DECREF(str);
5894 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005895
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 Py_XDECREF(str);
5898 return NULL;
5899}
Tim Petersced69f82003-09-16 20:30:58 +00005900
Guido van Rossum9e896b32000-04-05 20:11:21 +00005901/* --- Decimal Encoder ---------------------------------------------------- */
5902
5903int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 Py_ssize_t length,
5905 char *output,
5906 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005907{
5908 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 PyObject *errorHandler = NULL;
5910 PyObject *exc = NULL;
5911 const char *encoding = "decimal";
5912 const char *reason = "invalid decimal Unicode string";
5913 /* the following variable is used for caching string comparisons
5914 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5915 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005916
5917 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 PyErr_BadArgument();
5919 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005920 }
5921
5922 p = s;
5923 end = s + length;
5924 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 register Py_UNICODE ch = *p;
5926 int decimal;
5927 PyObject *repunicode;
5928 Py_ssize_t repsize;
5929 Py_ssize_t newpos;
5930 Py_UNICODE *uni2;
5931 Py_UNICODE *collstart;
5932 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005933
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005935 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 ++p;
5937 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005938 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 decimal = Py_UNICODE_TODECIMAL(ch);
5940 if (decimal >= 0) {
5941 *output++ = '0' + decimal;
5942 ++p;
5943 continue;
5944 }
5945 if (0 < ch && ch < 256) {
5946 *output++ = (char)ch;
5947 ++p;
5948 continue;
5949 }
5950 /* All other characters are considered unencodable */
5951 collstart = p;
5952 collend = p+1;
5953 while (collend < end) {
5954 if ((0 < *collend && *collend < 256) ||
5955 !Py_UNICODE_ISSPACE(*collend) ||
5956 Py_UNICODE_TODECIMAL(*collend))
5957 break;
5958 }
5959 /* cache callback name lookup
5960 * (if not done yet, i.e. it's the first error) */
5961 if (known_errorHandler==-1) {
5962 if ((errors==NULL) || (!strcmp(errors, "strict")))
5963 known_errorHandler = 1;
5964 else if (!strcmp(errors, "replace"))
5965 known_errorHandler = 2;
5966 else if (!strcmp(errors, "ignore"))
5967 known_errorHandler = 3;
5968 else if (!strcmp(errors, "xmlcharrefreplace"))
5969 known_errorHandler = 4;
5970 else
5971 known_errorHandler = 0;
5972 }
5973 switch (known_errorHandler) {
5974 case 1: /* strict */
5975 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5976 goto onError;
5977 case 2: /* replace */
5978 for (p = collstart; p < collend; ++p)
5979 *output++ = '?';
5980 /* fall through */
5981 case 3: /* ignore */
5982 p = collend;
5983 break;
5984 case 4: /* xmlcharrefreplace */
5985 /* generate replacement (temporarily (mis)uses p) */
5986 for (p = collstart; p < collend; ++p)
5987 output += sprintf(output, "&#%d;", (int)*p);
5988 p = collend;
5989 break;
5990 default:
5991 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5992 encoding, reason, s, length, &exc,
5993 collstart-s, collend-s, &newpos);
5994 if (repunicode == NULL)
5995 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005996 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005997 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005998 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5999 Py_DECREF(repunicode);
6000 goto onError;
6001 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 /* generate replacement */
6003 repsize = PyUnicode_GET_SIZE(repunicode);
6004 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6005 Py_UNICODE ch = *uni2;
6006 if (Py_UNICODE_ISSPACE(ch))
6007 *output++ = ' ';
6008 else {
6009 decimal = Py_UNICODE_TODECIMAL(ch);
6010 if (decimal >= 0)
6011 *output++ = '0' + decimal;
6012 else if (0 < ch && ch < 256)
6013 *output++ = (char)ch;
6014 else {
6015 Py_DECREF(repunicode);
6016 raise_encode_exception(&exc, encoding,
6017 s, length, collstart-s, collend-s, reason);
6018 goto onError;
6019 }
6020 }
6021 }
6022 p = s + newpos;
6023 Py_DECREF(repunicode);
6024 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006025 }
6026 /* 0-terminate the output string */
6027 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006028 Py_XDECREF(exc);
6029 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006030 return 0;
6031
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 Py_XDECREF(exc);
6034 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006035 return -1;
6036}
6037
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038/* --- Helpers ------------------------------------------------------------ */
6039
Eric Smith8c663262007-08-25 02:26:07 +00006040#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006041#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006042
Thomas Wouters477c8d52006-05-27 19:21:47 +00006043#include "stringlib/count.h"
6044#include "stringlib/find.h"
6045#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006046#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006047
Eric Smith5807c412008-05-11 21:00:57 +00006048#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006049#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006050#include "stringlib/localeutil.h"
6051
/* Helper macro to fix up start/end slice values in place.
   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative afterwards.
   `start` and `end` must be modifiable lvalues; every argument may be
   evaluated more than once, so pass side-effect-free expressions.
   The do/while(0) wrapper makes the expansion a single statement, so the
   macro is safe as the body of an unbraced if/else.  Invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006066
Martin v. Löwis18e16552006-02-15 17:27:45 +00006067Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006068 PyObject *substr,
6069 Py_ssize_t start,
6070 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006072 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006073 PyUnicodeObject* str_obj;
6074 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006075
Thomas Wouters477c8d52006-05-27 19:21:47 +00006076 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6077 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006079 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6080 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 Py_DECREF(str_obj);
6082 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 }
Tim Petersced69f82003-09-16 20:30:58 +00006084
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006085 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006086 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006087 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6088 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006089 );
6090
6091 Py_DECREF(sub_obj);
6092 Py_DECREF(str_obj);
6093
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 return result;
6095}
6096
Martin v. Löwis18e16552006-02-15 17:27:45 +00006097Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006098 PyObject *sub,
6099 Py_ssize_t start,
6100 Py_ssize_t end,
6101 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006103 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006104
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006106 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006108 sub = PyUnicode_FromObject(sub);
6109 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 Py_DECREF(str);
6111 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 }
Tim Petersced69f82003-09-16 20:30:58 +00006113
Thomas Wouters477c8d52006-05-27 19:21:47 +00006114 if (direction > 0)
6115 result = stringlib_find_slice(
6116 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6117 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6118 start, end
6119 );
6120 else
6121 result = stringlib_rfind_slice(
6122 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6123 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6124 start, end
6125 );
6126
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006128 Py_DECREF(sub);
6129
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 return result;
6131}
6132
Tim Petersced69f82003-09-16 20:30:58 +00006133static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 PyUnicodeObject *substring,
6136 Py_ssize_t start,
6137 Py_ssize_t end,
6138 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 if (substring->length == 0)
6141 return 1;
6142
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006143 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 end -= substring->length;
6145 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147
6148 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 if (Py_UNICODE_MATCH(self, end, substring))
6150 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 } else {
6152 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 }
6155
6156 return 0;
6157}
6158
Martin v. Löwis18e16552006-02-15 17:27:45 +00006159Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 PyObject *substr,
6161 Py_ssize_t start,
6162 Py_ssize_t end,
6163 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006165 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006166
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 str = PyUnicode_FromObject(str);
6168 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 substr = PyUnicode_FromObject(substr);
6171 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 Py_DECREF(str);
6173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 }
Tim Petersced69f82003-09-16 20:30:58 +00006175
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 (PyUnicodeObject *)substr,
6178 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 Py_DECREF(str);
6180 Py_DECREF(substr);
6181 return result;
6182}
6183
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184/* Apply fixfct filter to the Unicode object self and return a
6185 reference to the modified object */
6186
Tim Petersced69f82003-09-16 20:30:58 +00006187static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190{
6191
6192 PyUnicodeObject *u;
6193
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006194 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006197
6198 Py_UNICODE_COPY(u->str, self->str, self->length);
6199
Tim Peters7a29bd52001-09-12 03:03:31 +00006200 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 /* fixfct should return TRUE if it modified the buffer. If
6202 FALSE, return a reference to the original buffer instead
6203 (to save space, not time) */
6204 Py_INCREF(self);
6205 Py_DECREF(u);
6206 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 }
6208 return (PyObject*) u;
6209}
6210
Tim Petersced69f82003-09-16 20:30:58 +00006211static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212int fixupper(PyUnicodeObject *self)
6213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006214 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 Py_UNICODE *s = self->str;
6216 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006217
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006220
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 ch = Py_UNICODE_TOUPPER(*s);
6222 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 *s = ch;
6225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 s++;
6227 }
6228
6229 return status;
6230}
6231
Tim Petersced69f82003-09-16 20:30:58 +00006232static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233int fixlower(PyUnicodeObject *self)
6234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006235 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 Py_UNICODE *s = self->str;
6237 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006238
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006241
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 ch = Py_UNICODE_TOLOWER(*s);
6243 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 *s = ch;
6246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 s++;
6248 }
6249
6250 return status;
6251}
6252
Tim Petersced69f82003-09-16 20:30:58 +00006253static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254int fixswapcase(PyUnicodeObject *self)
6255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006256 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 Py_UNICODE *s = self->str;
6258 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006259
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 while (len-- > 0) {
6261 if (Py_UNICODE_ISUPPER(*s)) {
6262 *s = Py_UNICODE_TOLOWER(*s);
6263 status = 1;
6264 } else if (Py_UNICODE_ISLOWER(*s)) {
6265 *s = Py_UNICODE_TOUPPER(*s);
6266 status = 1;
6267 }
6268 s++;
6269 }
6270
6271 return status;
6272}
6273
Tim Petersced69f82003-09-16 20:30:58 +00006274static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275int fixcapitalize(PyUnicodeObject *self)
6276{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006277 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006278 Py_UNICODE *s = self->str;
6279 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006280
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006281 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006283 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 *s = Py_UNICODE_TOUPPER(*s);
6285 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006287 s++;
6288 while (--len > 0) {
6289 if (Py_UNICODE_ISUPPER(*s)) {
6290 *s = Py_UNICODE_TOLOWER(*s);
6291 status = 1;
6292 }
6293 s++;
6294 }
6295 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296}
6297
6298static
6299int fixtitle(PyUnicodeObject *self)
6300{
6301 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6302 register Py_UNICODE *e;
6303 int previous_is_cased;
6304
6305 /* Shortcut for single character strings */
6306 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6308 if (*p != ch) {
6309 *p = ch;
6310 return 1;
6311 }
6312 else
6313 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 }
Tim Petersced69f82003-09-16 20:30:58 +00006315
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 e = p + PyUnicode_GET_SIZE(self);
6317 previous_is_cased = 0;
6318 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006320
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 if (previous_is_cased)
6322 *p = Py_UNICODE_TOLOWER(ch);
6323 else
6324 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006325
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 if (Py_UNICODE_ISLOWER(ch) ||
6327 Py_UNICODE_ISUPPER(ch) ||
6328 Py_UNICODE_ISTITLE(ch))
6329 previous_is_cased = 1;
6330 else
6331 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 }
6333 return 1;
6334}
6335
Tim Peters8ce9f162004-08-27 01:49:32 +00006336PyObject *
6337PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338{
Skip Montanaro6543b452004-09-16 03:28:13 +00006339 const Py_UNICODE blank = ' ';
6340 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006341 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006342 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006343 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6344 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006345 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6346 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006347 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006348 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349
Tim Peters05eba1f2004-08-27 21:32:02 +00006350 fseq = PySequence_Fast(seq, "");
6351 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006352 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006353 }
6354
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006355 /* NOTE: the following code can't call back into Python code,
6356 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006357 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006358
Tim Peters05eba1f2004-08-27 21:32:02 +00006359 seqlen = PySequence_Fast_GET_SIZE(fseq);
6360 /* If empty sequence, return u"". */
6361 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006362 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6363 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006364 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006365 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006366 /* If singleton sequence with an exact Unicode, return that. */
6367 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 item = items[0];
6369 if (PyUnicode_CheckExact(item)) {
6370 Py_INCREF(item);
6371 res = (PyUnicodeObject *)item;
6372 goto Done;
6373 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006374 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006375 else {
6376 /* Set up sep and seplen */
6377 if (separator == NULL) {
6378 sep = &blank;
6379 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006380 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006381 else {
6382 if (!PyUnicode_Check(separator)) {
6383 PyErr_Format(PyExc_TypeError,
6384 "separator: expected str instance,"
6385 " %.80s found",
6386 Py_TYPE(separator)->tp_name);
6387 goto onError;
6388 }
6389 sep = PyUnicode_AS_UNICODE(separator);
6390 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006391 }
6392 }
6393
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006394 /* There are at least two things to join, or else we have a subclass
6395 * of str in the sequence.
6396 * Do a pre-pass to figure out the total amount of space we'll
6397 * need (sz), and see whether all argument are strings.
6398 */
6399 sz = 0;
6400 for (i = 0; i < seqlen; i++) {
6401 const Py_ssize_t old_sz = sz;
6402 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 if (!PyUnicode_Check(item)) {
6404 PyErr_Format(PyExc_TypeError,
6405 "sequence item %zd: expected str instance,"
6406 " %.80s found",
6407 i, Py_TYPE(item)->tp_name);
6408 goto onError;
6409 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006410 sz += PyUnicode_GET_SIZE(item);
6411 if (i != 0)
6412 sz += seplen;
6413 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6414 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006416 goto onError;
6417 }
6418 }
Tim Petersced69f82003-09-16 20:30:58 +00006419
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006420 res = _PyUnicode_New(sz);
6421 if (res == NULL)
6422 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006423
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006424 /* Catenate everything. */
6425 res_p = PyUnicode_AS_UNICODE(res);
6426 for (i = 0; i < seqlen; ++i) {
6427 Py_ssize_t itemlen;
6428 item = items[i];
6429 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 /* Copy item, and maybe the separator. */
6431 if (i) {
6432 Py_UNICODE_COPY(res_p, sep, seplen);
6433 res_p += seplen;
6434 }
6435 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6436 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006437 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006438
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006440 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 return (PyObject *)res;
6442
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006444 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006445 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 return NULL;
6447}
6448
Tim Petersced69f82003-09-16 20:30:58 +00006449static
6450PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 Py_ssize_t left,
6452 Py_ssize_t right,
6453 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454{
6455 PyUnicodeObject *u;
6456
6457 if (left < 0)
6458 left = 0;
6459 if (right < 0)
6460 right = 0;
6461
Tim Peters7a29bd52001-09-12 03:03:31 +00006462 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 Py_INCREF(self);
6464 return self;
6465 }
6466
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006467 if (left > PY_SSIZE_T_MAX - self->length ||
6468 right > PY_SSIZE_T_MAX - (left + self->length)) {
6469 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6470 return NULL;
6471 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 u = _PyUnicode_New(left + self->length + right);
6473 if (u) {
6474 if (left)
6475 Py_UNICODE_FILL(u->str, fill, left);
6476 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6477 if (right)
6478 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6479 }
6480
6481 return u;
6482}
6483
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006484PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487
6488 string = PyUnicode_FromObject(string);
6489 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006492 list = stringlib_splitlines(
6493 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6494 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495
6496 Py_DECREF(string);
6497 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498}
6499
Tim Petersced69f82003-09-16 20:30:58 +00006500static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 PyUnicodeObject *substring,
6503 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006506 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006509 return stringlib_split_whitespace(
6510 (PyObject*) self, self->str, self->length, maxcount
6511 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006513 return stringlib_split(
6514 (PyObject*) self, self->str, self->length,
6515 substring->str, substring->length,
6516 maxcount
6517 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518}
6519
Tim Petersced69f82003-09-16 20:30:58 +00006520static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006521PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 PyUnicodeObject *substring,
6523 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006524{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006525 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006526 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006527
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006528 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006529 return stringlib_rsplit_whitespace(
6530 (PyObject*) self, self->str, self->length, maxcount
6531 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006532
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006533 return stringlib_rsplit(
6534 (PyObject*) self, self->str, self->length,
6535 substring->str, substring->length,
6536 maxcount
6537 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006538}
6539
6540static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 PyUnicodeObject *str1,
6543 PyUnicodeObject *str2,
6544 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545{
6546 PyUnicodeObject *u;
6547
6548 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006550 else if (maxcount == 0 || self->length == 0)
6551 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
Thomas Wouters477c8d52006-05-27 19:21:47 +00006553 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006554 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006555 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006556 if (str1->length == 0)
6557 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006558 if (str1->length == 1) {
6559 /* replace characters */
6560 Py_UNICODE u1, u2;
6561 if (!findchar(self->str, self->length, str1->str[0]))
6562 goto nothing;
6563 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6564 if (!u)
6565 return NULL;
6566 Py_UNICODE_COPY(u->str, self->str, self->length);
6567 u1 = str1->str[0];
6568 u2 = str2->str[0];
6569 for (i = 0; i < u->length; i++)
6570 if (u->str[i] == u1) {
6571 if (--maxcount < 0)
6572 break;
6573 u->str[i] = u2;
6574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006576 i = stringlib_find(
6577 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006579 if (i < 0)
6580 goto nothing;
6581 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6582 if (!u)
6583 return NULL;
6584 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006585
6586 /* change everything in-place, starting with this one */
6587 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6588 i += str1->length;
6589
6590 while ( --maxcount > 0) {
6591 i = stringlib_find(self->str+i, self->length-i,
6592 str1->str, str1->length,
6593 i);
6594 if (i == -1)
6595 break;
6596 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6597 i += str1->length;
6598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006601
6602 Py_ssize_t n, i, j, e;
6603 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 Py_UNICODE *p;
6605
6606 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006607 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6608 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006609 if (n == 0)
6610 goto nothing;
6611 /* new_size = self->length + n * (str2->length - str1->length)); */
6612 delta = (str2->length - str1->length);
6613 if (delta == 0) {
6614 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006616 product = n * (str2->length - str1->length);
6617 if ((product / (str2->length - str1->length)) != n) {
6618 PyErr_SetString(PyExc_OverflowError,
6619 "replace string is too long");
6620 return NULL;
6621 }
6622 new_size = self->length + product;
6623 if (new_size < 0) {
6624 PyErr_SetString(PyExc_OverflowError,
6625 "replace string is too long");
6626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
6628 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006629 u = _PyUnicode_New(new_size);
6630 if (!u)
6631 return NULL;
6632 i = 0;
6633 p = u->str;
6634 e = self->length - str1->length;
6635 if (str1->length > 0) {
6636 while (n-- > 0) {
6637 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006638 j = stringlib_find(self->str+i, self->length-i,
6639 str1->str, str1->length,
6640 i);
6641 if (j == -1)
6642 break;
6643 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006644 /* copy unchanged part [i:j] */
6645 Py_UNICODE_COPY(p, self->str+i, j-i);
6646 p += j - i;
6647 }
6648 /* copy substitution string */
6649 if (str2->length > 0) {
6650 Py_UNICODE_COPY(p, str2->str, str2->length);
6651 p += str2->length;
6652 }
6653 i = j + str1->length;
6654 }
6655 if (i < self->length)
6656 /* copy tail [i:] */
6657 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6658 } else {
6659 /* interleave */
6660 while (n > 0) {
6661 Py_UNICODE_COPY(p, str2->str, str2->length);
6662 p += str2->length;
6663 if (--n <= 0)
6664 break;
6665 *p++ = self->str[i++];
6666 }
6667 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006671
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006673 /* nothing to replace; return original string (when possible) */
6674 if (PyUnicode_CheckExact(self)) {
6675 Py_INCREF(self);
6676 return (PyObject *) self;
6677 }
6678 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679}
6680
6681/* --- Unicode Object Methods --------------------------------------------- */
6682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006683PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685\n\
6686Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006687characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
6689static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006690unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 return fixup(self, fixtitle);
6693}
6694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006695PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697\n\
6698Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006699have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700
6701static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006702unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 return fixup(self, fixcapitalize);
6705}
6706
6707#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006708PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710\n\
6711Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
6714static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006715unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716{
6717 PyObject *list;
6718 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006719 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 /* Split into words */
6722 list = split(self, NULL, -1);
6723 if (!list)
6724 return NULL;
6725
6726 /* Capitalize each word */
6727 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6728 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 if (item == NULL)
6731 goto onError;
6732 Py_DECREF(PyList_GET_ITEM(list, i));
6733 PyList_SET_ITEM(list, i, item);
6734 }
6735
6736 /* Join the words to form a new string */
6737 item = PyUnicode_Join(NULL, list);
6738
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 Py_DECREF(list);
6741 return (PyObject *)item;
6742}
6743#endif
6744
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006745/* Argument converter. Coerces to a single unicode character */
6746
6747static int
6748convert_uc(PyObject *obj, void *addr)
6749{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006750 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6751 PyObject *uniobj;
6752 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006753
Benjamin Peterson14339b62009-01-31 16:36:08 +00006754 uniobj = PyUnicode_FromObject(obj);
6755 if (uniobj == NULL) {
6756 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006758 return 0;
6759 }
6760 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6761 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006763 Py_DECREF(uniobj);
6764 return 0;
6765 }
6766 unistr = PyUnicode_AS_UNICODE(uniobj);
6767 *fillcharloc = unistr[0];
6768 Py_DECREF(uniobj);
6769 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006770}
6771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006772PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006775Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006776done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777
6778static PyObject *
6779unicode_center(PyUnicodeObject *self, PyObject *args)
6780{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006781 Py_ssize_t marg, left;
6782 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006783 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784
Thomas Woutersde017742006-02-16 19:34:37 +00006785 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 return NULL;
6787
Tim Peters7a29bd52001-09-12 03:03:31 +00006788 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 Py_INCREF(self);
6790 return (PyObject*) self;
6791 }
6792
6793 marg = width - self->length;
6794 left = marg / 2 + (marg & width & 1);
6795
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006796 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797}
6798
Marc-André Lemburge5034372000-08-08 08:04:29 +00006799#if 0
6800
6801/* This code should go into some future Unicode collation support
6802 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006803 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006804
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006805/* speedy UTF-16 code point order comparison */
6806/* gleaned from: */
6807/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6808
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006809static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006810{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006811 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006812 0, 0, 0, 0, 0, 0, 0, 0,
6813 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006814 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006815};
6816
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817static int
6818unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6819{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006820 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006821
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 Py_UNICODE *s1 = str1->str;
6823 Py_UNICODE *s2 = str2->str;
6824
6825 len1 = str1->length;
6826 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006827
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006829 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006830
6831 c1 = *s1++;
6832 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006833
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 if (c1 > (1<<11) * 26)
6835 c1 += utf16Fixup[c1>>11];
6836 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006837 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006838 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006839
6840 if (c1 != c2)
6841 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006842
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006843 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 }
6845
6846 return (len1 < len2) ? -1 : (len1 != len2);
6847}
6848
Marc-André Lemburge5034372000-08-08 08:04:29 +00006849#else
6850
6851static int
6852unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6853{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006854 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006855
6856 Py_UNICODE *s1 = str1->str;
6857 Py_UNICODE *s2 = str2->str;
6858
6859 len1 = str1->length;
6860 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006861
Marc-André Lemburge5034372000-08-08 08:04:29 +00006862 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006863 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006864
Fredrik Lundh45714e92001-06-26 16:39:36 +00006865 c1 = *s1++;
6866 c2 = *s2++;
6867
6868 if (c1 != c2)
6869 return (c1 < c2) ? -1 : 1;
6870
Marc-André Lemburge5034372000-08-08 08:04:29 +00006871 len1--; len2--;
6872 }
6873
6874 return (len1 < len2) ? -1 : (len1 != len2);
6875}
6876
6877#endif
6878
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006882 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6883 return unicode_compare((PyUnicodeObject *)left,
6884 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006885 PyErr_Format(PyExc_TypeError,
6886 "Can't compare %.100s and %.100s",
6887 left->ob_type->tp_name,
6888 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 return -1;
6890}
6891
Martin v. Löwis5b222132007-06-10 09:51:05 +00006892int
6893PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6894{
6895 int i;
6896 Py_UNICODE *id;
6897 assert(PyUnicode_Check(uni));
6898 id = PyUnicode_AS_UNICODE(uni);
6899 /* Compare Unicode string and source character set string */
6900 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 if (id[i] != str[i])
6902 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006903 /* This check keeps Python strings that end in '\0' from comparing equal
6904 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006905 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006907 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006909 return 0;
6910}
6911
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006912
Benjamin Peterson29060642009-01-31 22:14:21 +00006913#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006914 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006915
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006916PyObject *PyUnicode_RichCompare(PyObject *left,
6917 PyObject *right,
6918 int op)
6919{
6920 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006921
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006922 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6923 PyObject *v;
6924 if (((PyUnicodeObject *) left)->length !=
6925 ((PyUnicodeObject *) right)->length) {
6926 if (op == Py_EQ) {
6927 Py_INCREF(Py_False);
6928 return Py_False;
6929 }
6930 if (op == Py_NE) {
6931 Py_INCREF(Py_True);
6932 return Py_True;
6933 }
6934 }
6935 if (left == right)
6936 result = 0;
6937 else
6938 result = unicode_compare((PyUnicodeObject *)left,
6939 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006940
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006941 /* Convert the return value to a Boolean */
6942 switch (op) {
6943 case Py_EQ:
6944 v = TEST_COND(result == 0);
6945 break;
6946 case Py_NE:
6947 v = TEST_COND(result != 0);
6948 break;
6949 case Py_LE:
6950 v = TEST_COND(result <= 0);
6951 break;
6952 case Py_GE:
6953 v = TEST_COND(result >= 0);
6954 break;
6955 case Py_LT:
6956 v = TEST_COND(result == -1);
6957 break;
6958 case Py_GT:
6959 v = TEST_COND(result == 1);
6960 break;
6961 default:
6962 PyErr_BadArgument();
6963 return NULL;
6964 }
6965 Py_INCREF(v);
6966 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006967 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006968
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006969 Py_INCREF(Py_NotImplemented);
6970 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006971}
6972
Guido van Rossum403d68b2000-03-13 15:55:09 +00006973int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006975{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006976 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006977 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006978
6979 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006980 sub = PyUnicode_FromObject(element);
6981 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 PyErr_Format(PyExc_TypeError,
6983 "'in <string>' requires string as left operand, not %s",
6984 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006985 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006986 }
6987
Thomas Wouters477c8d52006-05-27 19:21:47 +00006988 str = PyUnicode_FromObject(container);
6989 if (!str) {
6990 Py_DECREF(sub);
6991 return -1;
6992 }
6993
6994 result = stringlib_contains_obj(str, sub);
6995
6996 Py_DECREF(str);
6997 Py_DECREF(sub);
6998
Guido van Rossum403d68b2000-03-13 15:55:09 +00006999 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007000}
7001
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002/* Concat to string or Unicode object giving a new Unicode object. */
7003
7004PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006{
7007 PyUnicodeObject *u = NULL, *v = NULL, *w;
7008
7009 /* Coerce the two arguments */
7010 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7011 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7014 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016
7017 /* Shortcuts */
7018 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 Py_DECREF(v);
7020 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 }
7022 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 Py_DECREF(u);
7024 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 }
7026
7027 /* Concat the two Unicode strings */
7028 w = _PyUnicode_New(u->length + v->length);
7029 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 Py_UNICODE_COPY(w->str, u->str, u->length);
7032 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7033
7034 Py_DECREF(u);
7035 Py_DECREF(v);
7036 return (PyObject *)w;
7037
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 Py_XDECREF(u);
7040 Py_XDECREF(v);
7041 return NULL;
7042}
7043
Walter Dörwald1ab83302007-05-18 17:15:44 +00007044void
7045PyUnicode_Append(PyObject **pleft, PyObject *right)
7046{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007047 PyObject *new;
7048 if (*pleft == NULL)
7049 return;
7050 if (right == NULL || !PyUnicode_Check(*pleft)) {
7051 Py_DECREF(*pleft);
7052 *pleft = NULL;
7053 return;
7054 }
7055 new = PyUnicode_Concat(*pleft, right);
7056 Py_DECREF(*pleft);
7057 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007058}
7059
7060void
7061PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7062{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007063 PyUnicode_Append(pleft, right);
7064 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007065}
7066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007067PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007070Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007071string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007072interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073
7074static PyObject *
7075unicode_count(PyUnicodeObject *self, PyObject *args)
7076{
7077 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007078 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007079 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 PyObject *result;
7081
Guido van Rossumb8872e62000-05-09 14:14:27 +00007082 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084 return NULL;
7085
7086 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007087 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007090
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007091 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007092 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007093 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007094 substring->str, substring->length,
7095 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007096 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097
7098 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007099
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100 return result;
7101}
7102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007103PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007106Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007107to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007108handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007109a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7110'xmlcharrefreplace' as well as any other name registered with\n\
7111codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112
7113static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007114unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007116 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 char *encoding = NULL;
7118 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007119 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007120
Benjamin Peterson308d6372009-09-18 21:42:35 +00007121 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7122 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007124 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007125 if (v == NULL)
7126 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007127 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007128 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007129 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007130 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007131 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007132 Py_DECREF(v);
7133 return NULL;
7134 }
7135 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007136
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007138 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007139}
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143\n\
7144Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007145If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
7147static PyObject*
7148unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7149{
7150 Py_UNICODE *e;
7151 Py_UNICODE *p;
7152 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007153 Py_UNICODE *qe;
7154 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 PyUnicodeObject *u;
7156 int tabsize = 8;
7157
7158 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
Thomas Wouters7e474022000-07-16 12:04:32 +00007161 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007162 i = 0; /* chars up to and including most recent \n or \r */
7163 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7164 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 for (p = self->str; p < e; p++)
7166 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 if (tabsize > 0) {
7168 incr = tabsize - (j % tabsize); /* cannot overflow */
7169 if (j > PY_SSIZE_T_MAX - incr)
7170 goto overflow1;
7171 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007172 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 if (j > PY_SSIZE_T_MAX - 1)
7176 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 j++;
7178 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 if (i > PY_SSIZE_T_MAX - j)
7180 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007182 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 }
7184 }
7185
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007186 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007188
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 /* Second pass: create output string and fill it */
7190 u = _PyUnicode_New(i + j);
7191 if (!u)
7192 return NULL;
7193
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007194 j = 0; /* same as in first pass */
7195 q = u->str; /* next output char */
7196 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197
7198 for (p = self->str; p < e; p++)
7199 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 if (tabsize > 0) {
7201 i = tabsize - (j % tabsize);
7202 j += i;
7203 while (i--) {
7204 if (q >= qe)
7205 goto overflow2;
7206 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007207 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007209 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 else {
7211 if (q >= qe)
7212 goto overflow2;
7213 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007214 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 if (*p == '\n' || *p == '\r')
7216 j = 0;
7217 }
7218
7219 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007220
7221 overflow2:
7222 Py_DECREF(u);
7223 overflow1:
7224 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7225 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226}
7227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007228PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230\n\
7231Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007232such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233arguments start and end are interpreted as in slice notation.\n\
7234\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007235Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236
7237static PyObject *
7238unicode_find(PyUnicodeObject *self, PyObject *args)
7239{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007240 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007241 Py_ssize_t start;
7242 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007243 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244
Christian Heimes9cd17752007-11-18 19:35:23 +00007245 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247
Thomas Wouters477c8d52006-05-27 19:21:47 +00007248 result = stringlib_find_slice(
7249 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7250 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7251 start, end
7252 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253
7254 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007255
Christian Heimes217cfd12007-12-02 14:31:20 +00007256 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257}
7258
7259static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007260unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261{
7262 if (index < 0 || index >= self->length) {
7263 PyErr_SetString(PyExc_IndexError, "string index out of range");
7264 return NULL;
7265 }
7266
7267 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7268}
7269
Guido van Rossumc2504932007-09-18 19:42:40 +00007270/* Believe it or not, this produces the same value for ASCII strings
7271 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007273unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274{
Guido van Rossumc2504932007-09-18 19:42:40 +00007275 Py_ssize_t len;
7276 Py_UNICODE *p;
7277 long x;
7278
7279 if (self->hash != -1)
7280 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007281 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007282 p = self->str;
7283 x = *p << 7;
7284 while (--len >= 0)
7285 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007286 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007287 if (x == -1)
7288 x = -2;
7289 self->hash = x;
7290 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291}
7292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007293PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007296Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298static PyObject *
7299unicode_index(PyUnicodeObject *self, PyObject *args)
7300{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007301 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007302 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007303 Py_ssize_t start;
7304 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305
Christian Heimes9cd17752007-11-18 19:35:23 +00007306 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308
Thomas Wouters477c8d52006-05-27 19:21:47 +00007309 result = stringlib_find_slice(
7310 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7311 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7312 start, end
7313 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314
7315 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007316
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 if (result < 0) {
7318 PyErr_SetString(PyExc_ValueError, "substring not found");
7319 return NULL;
7320 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007321
Christian Heimes217cfd12007-12-02 14:31:20 +00007322 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323}
7324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007325PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007328Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007329at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330
7331static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007332unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333{
7334 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7335 register const Py_UNICODE *e;
7336 int cased;
7337
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338 /* Shortcut for single character strings */
7339 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007342 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007343 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007345
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 e = p + PyUnicode_GET_SIZE(self);
7347 cased = 0;
7348 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007350
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7352 return PyBool_FromLong(0);
7353 else if (!cased && Py_UNICODE_ISLOWER(ch))
7354 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007356 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357}
7358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007359PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007362Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007363at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364
7365static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007366unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367{
7368 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7369 register const Py_UNICODE *e;
7370 int cased;
7371
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 /* Shortcut for single character strings */
7373 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007376 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007377 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007379
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380 e = p + PyUnicode_GET_SIZE(self);
7381 cased = 0;
7382 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007384
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7386 return PyBool_FromLong(0);
7387 else if (!cased && Py_UNICODE_ISUPPER(ch))
7388 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007390 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391}
7392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007393PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007396Return True if S is a titlecased string and there is at least one\n\
7397character in S, i.e. upper- and titlecase characters may only\n\
7398follow uncased characters and lowercase characters only cased ones.\n\
7399Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400
7401static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007402unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403{
7404 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7405 register const Py_UNICODE *e;
7406 int cased, previous_is_cased;
7407
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408 /* Shortcut for single character strings */
7409 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7411 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007413 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007414 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007416
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 e = p + PyUnicode_GET_SIZE(self);
7418 cased = 0;
7419 previous_is_cased = 0;
7420 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007422
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7424 if (previous_is_cased)
7425 return PyBool_FromLong(0);
7426 previous_is_cased = 1;
7427 cased = 1;
7428 }
7429 else if (Py_UNICODE_ISLOWER(ch)) {
7430 if (!previous_is_cased)
7431 return PyBool_FromLong(0);
7432 previous_is_cased = 1;
7433 cased = 1;
7434 }
7435 else
7436 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007438 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439}
7440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007444Return True if all characters in S are whitespace\n\
7445and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007448unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
7450 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7451 register const Py_UNICODE *e;
7452
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 /* Shortcut for single character strings */
7454 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 Py_UNICODE_ISSPACE(*p))
7456 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007458 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007459 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007461
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 e = p + PyUnicode_GET_SIZE(self);
7463 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 if (!Py_UNICODE_ISSPACE(*p))
7465 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007467 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468}
7469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007470PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007472\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007473Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007474and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007475
7476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007477unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007478{
7479 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7480 register const Py_UNICODE *e;
7481
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007482 /* Shortcut for single character strings */
7483 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 Py_UNICODE_ISALPHA(*p))
7485 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007486
7487 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007488 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007490
7491 e = p + PyUnicode_GET_SIZE(self);
7492 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 if (!Py_UNICODE_ISALPHA(*p))
7494 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007495 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007496 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007497}
7498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007499PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007501\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007502Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007504
7505static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007506unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007507{
7508 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7509 register const Py_UNICODE *e;
7510
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007511 /* Shortcut for single character strings */
7512 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 Py_UNICODE_ISALNUM(*p))
7514 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007515
7516 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007517 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007519
7520 e = p + PyUnicode_GET_SIZE(self);
7521 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 if (!Py_UNICODE_ISALNUM(*p))
7523 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007525 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007526}
7527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007528PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007531Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007532False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533
7534static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007535unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536{
7537 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7538 register const Py_UNICODE *e;
7539
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 /* Shortcut for single character strings */
7541 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 Py_UNICODE_ISDECIMAL(*p))
7543 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007545 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007546 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007548
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 e = p + PyUnicode_GET_SIZE(self);
7550 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 if (!Py_UNICODE_ISDECIMAL(*p))
7552 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007554 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555}
7556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007557PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007560Return True if all characters in S are digits\n\
7561and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562
7563static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007564unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565{
7566 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7567 register const Py_UNICODE *e;
7568
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 /* Shortcut for single character strings */
7570 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007571 Py_UNICODE_ISDIGIT(*p))
7572 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007574 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007575 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007577
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 e = p + PyUnicode_GET_SIZE(self);
7579 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 if (!Py_UNICODE_ISDIGIT(*p))
7581 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007583 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584}
7585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007586PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007589Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007590False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591
7592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007593unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594{
7595 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7596 register const Py_UNICODE *e;
7597
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 /* Shortcut for single character strings */
7599 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 Py_UNICODE_ISNUMERIC(*p))
7601 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007603 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007604 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007606
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 e = p + PyUnicode_GET_SIZE(self);
7608 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 if (!Py_UNICODE_ISNUMERIC(*p))
7610 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007612 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613}
7614
Martin v. Löwis47383402007-08-15 07:32:56 +00007615int
7616PyUnicode_IsIdentifier(PyObject *self)
7617{
7618 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7619 register const Py_UNICODE *e;
7620
7621 /* Special case for empty strings */
7622 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007624
7625 /* PEP 3131 says that the first character must be in
7626 XID_Start and subsequent characters in XID_Continue,
7627 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007629 letters, digits, underscore). However, given the current
7630 definition of XID_Start and XID_Continue, it is sufficient
7631 to check just for these, except that _ must be allowed
7632 as starting an identifier. */
7633 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7634 return 0;
7635
7636 e = p + PyUnicode_GET_SIZE(self);
7637 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 if (!_PyUnicode_IsXidContinue(*p))
7639 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007640 }
7641 return 1;
7642}
7643
7644PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007646\n\
7647Return True if S is a valid identifier according\n\
7648to the language definition.");
7649
7650static PyObject*
7651unicode_isidentifier(PyObject *self)
7652{
7653 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7654}
7655
Georg Brandl559e5d72008-06-11 18:37:52 +00007656PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007658\n\
7659Return True if all characters in S are considered\n\
7660printable in repr() or S is empty, False otherwise.");
7661
7662static PyObject*
7663unicode_isprintable(PyObject *self)
7664{
7665 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7666 register const Py_UNICODE *e;
7667
7668 /* Shortcut for single character strings */
7669 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7670 Py_RETURN_TRUE;
7671 }
7672
7673 e = p + PyUnicode_GET_SIZE(self);
7674 for (; p < e; p++) {
7675 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7676 Py_RETURN_FALSE;
7677 }
7678 }
7679 Py_RETURN_TRUE;
7680}
7681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007682PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007683 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684\n\
7685Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007686iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687
7688static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007689unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007691 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692}
7693
Martin v. Löwis18e16552006-02-15 17:27:45 +00007694static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695unicode_length(PyUnicodeObject *self)
7696{
7697 return self->length;
7698}
7699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007700PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007703Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007704done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705
7706static PyObject *
7707unicode_ljust(PyUnicodeObject *self, PyObject *args)
7708{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007709 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007710 Py_UNICODE fillchar = ' ';
7711
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007712 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 return NULL;
7714
Tim Peters7a29bd52001-09-12 03:03:31 +00007715 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 Py_INCREF(self);
7717 return (PyObject*) self;
7718 }
7719
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007720 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721}
7722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007723PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007726Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727
7728static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007729unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 return fixup(self, fixlower);
7732}
7733
/* Selectors for the strip family; they double as indexes into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7742
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007743/* externally visible for str.strip(unicode) */
7744PyObject *
7745_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7746{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007747 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7748 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7749 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7750 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7751 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007752
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007754
Benjamin Peterson14339b62009-01-31 16:36:08 +00007755 i = 0;
7756 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7758 i++;
7759 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007760 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007761
Benjamin Peterson14339b62009-01-31 16:36:08 +00007762 j = len;
7763 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 do {
7765 j--;
7766 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7767 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007768 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007769
Benjamin Peterson14339b62009-01-31 16:36:08 +00007770 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 Py_INCREF(self);
7772 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007773 }
7774 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007776}
7777
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778
7779static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007780do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7783 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007784
Benjamin Peterson14339b62009-01-31 16:36:08 +00007785 i = 0;
7786 if (striptype != RIGHTSTRIP) {
7787 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7788 i++;
7789 }
7790 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007791
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 j = len;
7793 if (striptype != LEFTSTRIP) {
7794 do {
7795 j--;
7796 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7797 j++;
7798 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007799
Benjamin Peterson14339b62009-01-31 16:36:08 +00007800 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7801 Py_INCREF(self);
7802 return (PyObject*)self;
7803 }
7804 else
7805 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806}
7807
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007808
7809static PyObject *
7810do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7811{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007812 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007813
Benjamin Peterson14339b62009-01-31 16:36:08 +00007814 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7815 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007816
Benjamin Peterson14339b62009-01-31 16:36:08 +00007817 if (sep != NULL && sep != Py_None) {
7818 if (PyUnicode_Check(sep))
7819 return _PyUnicode_XStrip(self, striptype, sep);
7820 else {
7821 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 "%s arg must be None or str",
7823 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007824 return NULL;
7825 }
7826 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007827
Benjamin Peterson14339b62009-01-31 16:36:08 +00007828 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007829}
7830
7831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007832PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007834\n\
7835Return a copy of the string S with leading and trailing\n\
7836whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007837If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007838
7839static PyObject *
7840unicode_strip(PyUnicodeObject *self, PyObject *args)
7841{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007842 if (PyTuple_GET_SIZE(args) == 0)
7843 return do_strip(self, BOTHSTRIP); /* Common case */
7844 else
7845 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007846}
7847
7848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007849PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007851\n\
7852Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007853If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007854
7855static PyObject *
7856unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7857{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007858 if (PyTuple_GET_SIZE(args) == 0)
7859 return do_strip(self, LEFTSTRIP); /* Common case */
7860 else
7861 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007862}
7863
7864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007865PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007867\n\
7868Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007869If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007870
7871static PyObject *
7872unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7873{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007874 if (PyTuple_GET_SIZE(args) == 0)
7875 return do_strip(self, RIGHTSTRIP); /* Common case */
7876 else
7877 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007878}
7879
7880
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007882unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883{
7884 PyUnicodeObject *u;
7885 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007886 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007887 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888
Georg Brandl222de0f2009-04-12 12:01:50 +00007889 if (len < 1) {
7890 Py_INCREF(unicode_empty);
7891 return (PyObject *)unicode_empty;
7892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893
Tim Peters7a29bd52001-09-12 03:03:31 +00007894 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895 /* no repeat, return original string */
7896 Py_INCREF(str);
7897 return (PyObject*) str;
7898 }
Tim Peters8f422462000-09-09 06:13:41 +00007899
7900 /* ensure # of chars needed doesn't overflow int and # of bytes
7901 * needed doesn't overflow size_t
7902 */
7903 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007904 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007905 PyErr_SetString(PyExc_OverflowError,
7906 "repeated string is too long");
7907 return NULL;
7908 }
7909 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7910 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7911 PyErr_SetString(PyExc_OverflowError,
7912 "repeated string is too long");
7913 return NULL;
7914 }
7915 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916 if (!u)
7917 return NULL;
7918
7919 p = u->str;
7920
Georg Brandl222de0f2009-04-12 12:01:50 +00007921 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007922 Py_UNICODE_FILL(p, str->str[0], len);
7923 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007924 Py_ssize_t done = str->length; /* number of characters copied this far */
7925 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007927 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007928 Py_UNICODE_COPY(p+done, p, n);
7929 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931 }
7932
7933 return (PyObject*) u;
7934}
7935
7936PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 PyObject *subobj,
7938 PyObject *replobj,
7939 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940{
7941 PyObject *self;
7942 PyObject *str1;
7943 PyObject *str2;
7944 PyObject *result;
7945
7946 self = PyUnicode_FromObject(obj);
7947 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949 str1 = PyUnicode_FromObject(subobj);
7950 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 Py_DECREF(self);
7952 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 }
7954 str2 = PyUnicode_FromObject(replobj);
7955 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 Py_DECREF(self);
7957 Py_DECREF(str1);
7958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 }
Tim Petersced69f82003-09-16 20:30:58 +00007960 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 (PyUnicodeObject *)str1,
7962 (PyUnicodeObject *)str2,
7963 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 Py_DECREF(self);
7965 Py_DECREF(str1);
7966 Py_DECREF(str2);
7967 return result;
7968}
7969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007970PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00007971 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972\n\
7973Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007974old replaced by new. If the optional argument count is\n\
7975given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976
7977static PyObject*
7978unicode_replace(PyUnicodeObject *self, PyObject *args)
7979{
7980 PyUnicodeObject *str1;
7981 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007982 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 PyObject *result;
7984
Martin v. Löwis18e16552006-02-15 17:27:45 +00007985 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 return NULL;
7987 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7988 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007991 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 Py_DECREF(str1);
7993 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995
7996 result = replace(self, str1, str2, maxcount);
7997
7998 Py_DECREF(str1);
7999 Py_DECREF(str2);
8000 return result;
8001}
8002
8003static
8004PyObject *unicode_repr(PyObject *unicode)
8005{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008006 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008007 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008008 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8009 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8010
8011 /* XXX(nnorwitz): rather than over-allocating, it would be
8012 better to choose a different scheme. Perhaps scan the
8013 first N-chars of the string and allocate based on that size.
8014 */
8015 /* Initial allocation is based on the longest-possible unichr
8016 escape.
8017
8018 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8019 unichr, so in this case it's the longest unichr escape. In
8020 narrow (UTF-16) builds this is five chars per source unichr
8021 since there are two unichrs in the surrogate pair, so in narrow
8022 (UTF-16) builds it's not the longest unichr escape.
8023
8024 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8025 so in the narrow (UTF-16) build case it's the longest unichr
8026 escape.
8027 */
8028
Walter Dörwald1ab83302007-05-18 17:15:44 +00008029 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008031#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008033#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008035#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008037 if (repr == NULL)
8038 return NULL;
8039
Walter Dörwald1ab83302007-05-18 17:15:44 +00008040 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008041
8042 /* Add quote */
8043 *p++ = (findchar(s, size, '\'') &&
8044 !findchar(s, size, '"')) ? '"' : '\'';
8045 while (size-- > 0) {
8046 Py_UNICODE ch = *s++;
8047
8048 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008049 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008050 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008051 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008052 continue;
8053 }
8054
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008056 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008057 *p++ = '\\';
8058 *p++ = 't';
8059 }
8060 else if (ch == '\n') {
8061 *p++ = '\\';
8062 *p++ = 'n';
8063 }
8064 else if (ch == '\r') {
8065 *p++ = '\\';
8066 *p++ = 'r';
8067 }
8068
8069 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008070 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008071 *p++ = '\\';
8072 *p++ = 'x';
8073 *p++ = hexdigits[(ch >> 4) & 0x000F];
8074 *p++ = hexdigits[ch & 0x000F];
8075 }
8076
Georg Brandl559e5d72008-06-11 18:37:52 +00008077 /* Copy ASCII characters as-is */
8078 else if (ch < 0x7F) {
8079 *p++ = ch;
8080 }
8081
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008083 else {
8084 Py_UCS4 ucs = ch;
8085
8086#ifndef Py_UNICODE_WIDE
8087 Py_UNICODE ch2 = 0;
8088 /* Get code point from surrogate pair */
8089 if (size > 0) {
8090 ch2 = *s;
8091 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008093 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008095 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008096 size--;
8097 }
8098 }
8099#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008100 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008101 (categories Z* and C* except ASCII space)
8102 */
8103 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8104 /* Map 8-bit characters to '\xhh' */
8105 if (ucs <= 0xff) {
8106 *p++ = '\\';
8107 *p++ = 'x';
8108 *p++ = hexdigits[(ch >> 4) & 0x000F];
8109 *p++ = hexdigits[ch & 0x000F];
8110 }
8111 /* Map 21-bit characters to '\U00xxxxxx' */
8112 else if (ucs >= 0x10000) {
8113 *p++ = '\\';
8114 *p++ = 'U';
8115 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8116 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8117 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8118 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8119 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8120 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8121 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8122 *p++ = hexdigits[ucs & 0x0000000F];
8123 }
8124 /* Map 16-bit characters to '\uxxxx' */
8125 else {
8126 *p++ = '\\';
8127 *p++ = 'u';
8128 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8129 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8130 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8131 *p++ = hexdigits[ucs & 0x000F];
8132 }
8133 }
8134 /* Copy characters as-is */
8135 else {
8136 *p++ = ch;
8137#ifndef Py_UNICODE_WIDE
8138 if (ucs >= 0x10000)
8139 *p++ = ch2;
8140#endif
8141 }
8142 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008143 }
8144 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008145 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008146
8147 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008148 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008149 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150}
8151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008152PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154\n\
8155Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008156such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157arguments start and end are interpreted as in slice notation.\n\
8158\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008159Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160
8161static PyObject *
8162unicode_rfind(PyUnicodeObject *self, PyObject *args)
8163{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008164 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008165 Py_ssize_t start;
8166 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008167 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168
Christian Heimes9cd17752007-11-18 19:35:23 +00008169 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171
Thomas Wouters477c8d52006-05-27 19:21:47 +00008172 result = stringlib_rfind_slice(
8173 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8174 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8175 start, end
8176 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177
8178 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008179
Christian Heimes217cfd12007-12-02 14:31:20 +00008180 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181}
8182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008183PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008186Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187
8188static PyObject *
8189unicode_rindex(PyUnicodeObject *self, PyObject *args)
8190{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008191 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008192 Py_ssize_t start;
8193 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008194 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195
Christian Heimes9cd17752007-11-18 19:35:23 +00008196 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198
Thomas Wouters477c8d52006-05-27 19:21:47 +00008199 result = stringlib_rfind_slice(
8200 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8201 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8202 start, end
8203 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204
8205 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008206
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207 if (result < 0) {
8208 PyErr_SetString(PyExc_ValueError, "substring not found");
8209 return NULL;
8210 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008211 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212}
8213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008214PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008217Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008218done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219
8220static PyObject *
8221unicode_rjust(PyUnicodeObject *self, PyObject *args)
8222{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008223 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008224 Py_UNICODE fillchar = ' ';
8225
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008226 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227 return NULL;
8228
Tim Peters7a29bd52001-09-12 03:03:31 +00008229 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230 Py_INCREF(self);
8231 return (PyObject*) self;
8232 }
8233
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008234 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235}
8236
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 PyObject *sep,
8239 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240{
8241 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008242
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 s = PyUnicode_FromObject(s);
8244 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008245 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 if (sep != NULL) {
8247 sep = PyUnicode_FromObject(sep);
8248 if (sep == NULL) {
8249 Py_DECREF(s);
8250 return NULL;
8251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 }
8253
8254 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8255
8256 Py_DECREF(s);
8257 Py_XDECREF(sep);
8258 return result;
8259}
8260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008261PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263\n\
8264Return a list of the words in S, using sep as the\n\
8265delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008266splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008267whitespace string is a separator and empty strings are\n\
8268removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269
8270static PyObject*
8271unicode_split(PyUnicodeObject *self, PyObject *args)
8272{
8273 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008274 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275
Martin v. Löwis18e16552006-02-15 17:27:45 +00008276 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 return NULL;
8278
8279 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285}
8286
Thomas Wouters477c8d52006-05-27 19:21:47 +00008287PyObject *
8288PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8289{
8290 PyObject* str_obj;
8291 PyObject* sep_obj;
8292 PyObject* out;
8293
8294 str_obj = PyUnicode_FromObject(str_in);
8295 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008297 sep_obj = PyUnicode_FromObject(sep_in);
8298 if (!sep_obj) {
8299 Py_DECREF(str_obj);
8300 return NULL;
8301 }
8302
8303 out = stringlib_partition(
8304 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8305 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8306 );
8307
8308 Py_DECREF(sep_obj);
8309 Py_DECREF(str_obj);
8310
8311 return out;
8312}
8313
8314
8315PyObject *
8316PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8317{
8318 PyObject* str_obj;
8319 PyObject* sep_obj;
8320 PyObject* out;
8321
8322 str_obj = PyUnicode_FromObject(str_in);
8323 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008325 sep_obj = PyUnicode_FromObject(sep_in);
8326 if (!sep_obj) {
8327 Py_DECREF(str_obj);
8328 return NULL;
8329 }
8330
8331 out = stringlib_rpartition(
8332 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8333 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8334 );
8335
8336 Py_DECREF(sep_obj);
8337 Py_DECREF(str_obj);
8338
8339 return out;
8340}
8341
8342PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008344\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008345Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008346the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008347found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008348
8349static PyObject*
8350unicode_partition(PyUnicodeObject *self, PyObject *separator)
8351{
8352 return PyUnicode_Partition((PyObject *)self, separator);
8353}
8354
8355PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008356 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008357\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008358Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008359the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008360separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008361
8362static PyObject*
8363unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8364{
8365 return PyUnicode_RPartition((PyObject *)self, separator);
8366}
8367
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008368PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 PyObject *sep,
8370 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008371{
8372 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008373
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008374 s = PyUnicode_FromObject(s);
8375 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008376 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 if (sep != NULL) {
8378 sep = PyUnicode_FromObject(sep);
8379 if (sep == NULL) {
8380 Py_DECREF(s);
8381 return NULL;
8382 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008383 }
8384
8385 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8386
8387 Py_DECREF(s);
8388 Py_XDECREF(sep);
8389 return result;
8390}
8391
8392PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008394\n\
8395Return a list of the words in S, using sep as the\n\
8396delimiter string, starting at the end of the string and\n\
8397working to the front. If maxsplit is given, at most maxsplit\n\
8398splits are done. If sep is not specified, any whitespace string\n\
8399is a separator.");
8400
8401static PyObject*
8402unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8403{
8404 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008405 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008406
Martin v. Löwis18e16552006-02-15 17:27:45 +00008407 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008408 return NULL;
8409
8410 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008412 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008414 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008416}
8417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008418PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420\n\
8421Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008422Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008423is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424
8425static PyObject*
8426unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8427{
Guido van Rossum86662912000-04-11 15:38:46 +00008428 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429
Guido van Rossum86662912000-04-11 15:38:46 +00008430 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 return NULL;
8432
Guido van Rossum86662912000-04-11 15:38:46 +00008433 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434}
8435
8436static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008437PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438{
Walter Dörwald346737f2007-05-31 10:44:43 +00008439 if (PyUnicode_CheckExact(self)) {
8440 Py_INCREF(self);
8441 return self;
8442 } else
8443 /* Subtype -- return genuine unicode string with the same value. */
8444 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8445 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446}
8447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008448PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450\n\
8451Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008452and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453
8454static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008455unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 return fixup(self, fixswapcase);
8458}
8459
Georg Brandlceee0772007-11-27 23:48:05 +00008460PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008462\n\
8463Return a translation table usable for str.translate().\n\
8464If there is only one argument, it must be a dictionary mapping Unicode\n\
8465ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008466Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008467If there are two arguments, they must be strings of equal length, and\n\
8468in the resulting dictionary, each character in x will be mapped to the\n\
8469character at the same position in y. If there is a third argument, it\n\
8470must be a string, whose characters will be mapped to None in the result.");
8471
8472static PyObject*
8473unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8474{
8475 PyObject *x, *y = NULL, *z = NULL;
8476 PyObject *new = NULL, *key, *value;
8477 Py_ssize_t i = 0;
8478 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008479
Georg Brandlceee0772007-11-27 23:48:05 +00008480 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8481 return NULL;
8482 new = PyDict_New();
8483 if (!new)
8484 return NULL;
8485 if (y != NULL) {
8486 /* x must be a string too, of equal length */
8487 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8488 if (!PyUnicode_Check(x)) {
8489 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8490 "be a string if there is a second argument");
8491 goto err;
8492 }
8493 if (PyUnicode_GET_SIZE(x) != ylen) {
8494 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8495 "arguments must have equal length");
8496 goto err;
8497 }
8498 /* create entries for translating chars in x to those in y */
8499 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008500 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8501 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008502 if (!key || !value)
8503 goto err;
8504 res = PyDict_SetItem(new, key, value);
8505 Py_DECREF(key);
8506 Py_DECREF(value);
8507 if (res < 0)
8508 goto err;
8509 }
8510 /* create entries for deleting chars in z */
8511 if (z != NULL) {
8512 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008513 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008514 if (!key)
8515 goto err;
8516 res = PyDict_SetItem(new, key, Py_None);
8517 Py_DECREF(key);
8518 if (res < 0)
8519 goto err;
8520 }
8521 }
8522 } else {
8523 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008524 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008525 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8526 "to maketrans it must be a dict");
8527 goto err;
8528 }
8529 /* copy entries into the new dict, converting string keys to int keys */
8530 while (PyDict_Next(x, &i, &key, &value)) {
8531 if (PyUnicode_Check(key)) {
8532 /* convert string keys to integer keys */
8533 PyObject *newkey;
8534 if (PyUnicode_GET_SIZE(key) != 1) {
8535 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8536 "table must be of length 1");
8537 goto err;
8538 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008539 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008540 if (!newkey)
8541 goto err;
8542 res = PyDict_SetItem(new, newkey, value);
8543 Py_DECREF(newkey);
8544 if (res < 0)
8545 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008546 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008547 /* just keep integer keys */
8548 if (PyDict_SetItem(new, key, value) < 0)
8549 goto err;
8550 } else {
8551 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8552 "be strings or integers");
8553 goto err;
8554 }
8555 }
8556 }
8557 return new;
8558 err:
8559 Py_DECREF(new);
8560 return NULL;
8561}
8562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008563PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565\n\
8566Return a copy of the string S, where all characters have been mapped\n\
8567through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008568Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008569Unmapped characters are left untouched. Characters mapped to None\n\
8570are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571
8572static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008573unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574{
Georg Brandlceee0772007-11-27 23:48:05 +00008575 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576}
8577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008578PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008581Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582
8583static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008584unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 return fixup(self, fixupper);
8587}
8588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008589PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008592Pad a numeric string S with zeros on the left, to fill a field\n\
8593of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594
8595static PyObject *
8596unicode_zfill(PyUnicodeObject *self, PyObject *args)
8597{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008598 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 PyUnicodeObject *u;
8600
Martin v. Löwis18e16552006-02-15 17:27:45 +00008601 Py_ssize_t width;
8602 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 return NULL;
8604
8605 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008606 if (PyUnicode_CheckExact(self)) {
8607 Py_INCREF(self);
8608 return (PyObject*) self;
8609 }
8610 else
8611 return PyUnicode_FromUnicode(
8612 PyUnicode_AS_UNICODE(self),
8613 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615 }
8616
8617 fill = width - self->length;
8618
8619 u = pad(self, fill, 0, '0');
8620
Walter Dörwald068325e2002-04-15 13:36:47 +00008621 if (u == NULL)
8622 return NULL;
8623
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 if (u->str[fill] == '+' || u->str[fill] == '-') {
8625 /* move sign to beginning of string */
8626 u->str[0] = u->str[fill];
8627 u->str[fill] = '0';
8628 }
8629
8630 return (PyObject*) u;
8631}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632
#if 0
/* Compiled-out debugging helper: reports the current value of the
   file-level counter numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long free_count = numfree;

    return PyLong_FromLong(free_count);
}
#endif
8640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008641PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008644Return True if S starts with the specified prefix, False otherwise.\n\
8645With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008646With optional end, stop comparing S at that position.\n\
8647prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648
8649static PyObject *
8650unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008653 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008655 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008656 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008657 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008659 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8661 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008662 if (PyTuple_Check(subobj)) {
8663 Py_ssize_t i;
8664 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8665 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008667 if (substring == NULL)
8668 return NULL;
8669 result = tailmatch(self, substring, start, end, -1);
8670 Py_DECREF(substring);
8671 if (result) {
8672 Py_RETURN_TRUE;
8673 }
8674 }
8675 /* nothing matched */
8676 Py_RETURN_FALSE;
8677 }
8678 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008681 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008683 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684}
8685
8686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008687PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008690Return True if S ends with the specified suffix, False otherwise.\n\
8691With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008692With optional end, stop comparing S at that position.\n\
8693suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694
8695static PyObject *
8696unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008699 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008701 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008702 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008703 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008705 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8707 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008708 if (PyTuple_Check(subobj)) {
8709 Py_ssize_t i;
8710 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8711 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008713 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008715 result = tailmatch(self, substring, start, end, +1);
8716 Py_DECREF(substring);
8717 if (result) {
8718 Py_RETURN_TRUE;
8719 }
8720 }
8721 Py_RETURN_FALSE;
8722 }
8723 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008727 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008729 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730}
8731
Eric Smith8c663262007-08-25 02:26:07 +00008732#include "stringlib/string_format.h"
8733
8734PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008736\n\
8737");
8738
Eric Smith4a7d76d2008-05-30 18:10:19 +00008739static PyObject *
8740unicode__format__(PyObject* self, PyObject* args)
8741{
8742 PyObject *format_spec;
8743
8744 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8745 return NULL;
8746
8747 return _PyUnicode_FormatAdvanced(self,
8748 PyUnicode_AS_UNICODE(format_spec),
8749 PyUnicode_GET_SIZE(format_spec));
8750}
8751
Eric Smith8c663262007-08-25 02:26:07 +00008752PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008754\n\
8755");
8756
8757static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008758unicode__sizeof__(PyUnicodeObject *v)
8759{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008760 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8761 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008762}
8763
8764PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008766
8767static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008768unicode_getnewargs(PyUnicodeObject *v)
8769{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008770 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008771}
8772
8773
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774static PyMethodDef unicode_methods[] = {
8775
8776 /* Order is according to common usage: often used methods should
8777 appear first, since lookup is done sequentially. */
8778
Benjamin Peterson308d6372009-09-18 21:42:35 +00008779 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008780 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8781 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008782 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008783 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8784 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8785 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8786 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8787 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8788 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8789 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008790 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008791 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8792 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8793 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008794 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008795 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8796 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8797 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008798 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008799 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008800 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008801 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008802 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8803 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8804 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8805 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8806 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8807 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8808 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8809 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8810 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8811 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8812 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8813 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8814 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8815 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008816 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008817 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008818 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008819 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008820 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008821 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8822 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008823 {"maketrans", (PyCFunction) unicode_maketrans,
8824 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008825 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008826#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008827 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828#endif
8829
8830#if 0
8831 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008832 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833#endif
8834
Benjamin Peterson14339b62009-01-31 16:36:08 +00008835 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836 {NULL, NULL}
8837};
8838
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008839static PyObject *
8840unicode_mod(PyObject *v, PyObject *w)
8841{
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 if (!PyUnicode_Check(v)) {
8843 Py_INCREF(Py_NotImplemented);
8844 return Py_NotImplemented;
8845 }
8846 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008847}
8848
8849static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008850 0, /*nb_add*/
8851 0, /*nb_subtract*/
8852 0, /*nb_multiply*/
8853 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008854};
8855
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008857 (lenfunc) unicode_length, /* sq_length */
8858 PyUnicode_Concat, /* sq_concat */
8859 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8860 (ssizeargfunc) unicode_getitem, /* sq_item */
8861 0, /* sq_slice */
8862 0, /* sq_ass_item */
8863 0, /* sq_ass_slice */
8864 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865};
8866
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008867static PyObject*
8868unicode_subscript(PyUnicodeObject* self, PyObject* item)
8869{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008870 if (PyIndex_Check(item)) {
8871 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008872 if (i == -1 && PyErr_Occurred())
8873 return NULL;
8874 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008875 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008876 return unicode_getitem(self, i);
8877 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008878 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008879 Py_UNICODE* source_buf;
8880 Py_UNICODE* result_buf;
8881 PyObject* result;
8882
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008883 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008885 return NULL;
8886 }
8887
8888 if (slicelength <= 0) {
8889 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008890 } else if (start == 0 && step == 1 && slicelength == self->length &&
8891 PyUnicode_CheckExact(self)) {
8892 Py_INCREF(self);
8893 return (PyObject *)self;
8894 } else if (step == 1) {
8895 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008896 } else {
8897 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008898 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8899 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008900
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 if (result_buf == NULL)
8902 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008903
8904 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8905 result_buf[i] = source_buf[cur];
8906 }
Tim Petersced69f82003-09-16 20:30:58 +00008907
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008908 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008909 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008910 return result;
8911 }
8912 } else {
8913 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8914 return NULL;
8915 }
8916}
8917
8918static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008919 (lenfunc)unicode_length, /* mp_length */
8920 (binaryfunc)unicode_subscript, /* mp_subscript */
8921 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008922};
8923
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925/* Helpers for PyUnicode_Format() */
8926
8927static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008928getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008930 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 (*p_argidx)++;
8933 if (arglen < 0)
8934 return args;
8935 else
8936 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 }
8938 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 return NULL;
8941}
8942
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008943/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008945static PyObject *
8946formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008948 char *p;
8949 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008951
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952 x = PyFloat_AsDouble(v);
8953 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008954 return NULL;
8955
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008958
Eric Smith0923d1d2009-04-16 20:16:10 +00008959 p = PyOS_double_to_string(x, type, prec,
8960 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008961 if (p == NULL)
8962 return NULL;
8963 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008964 PyMem_Free(p);
8965 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966}
8967
Tim Peters38fd5b62000-09-21 05:43:11 +00008968static PyObject*
8969formatlong(PyObject *val, int flags, int prec, int type)
8970{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008971 char *buf;
8972 int len;
8973 PyObject *str; /* temporary string object. */
8974 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008975
Benjamin Peterson14339b62009-01-31 16:36:08 +00008976 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8977 if (!str)
8978 return NULL;
8979 result = PyUnicode_FromStringAndSize(buf, len);
8980 Py_DECREF(str);
8981 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008982}
8983
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984static int
8985formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008986 size_t buflen,
8987 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008989 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008990 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 if (PyUnicode_GET_SIZE(v) == 1) {
8992 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8993 buf[1] = '\0';
8994 return 1;
8995 }
8996#ifndef Py_UNICODE_WIDE
8997 if (PyUnicode_GET_SIZE(v) == 2) {
8998 /* Decode a valid surrogate pair */
8999 int c0 = PyUnicode_AS_UNICODE(v)[0];
9000 int c1 = PyUnicode_AS_UNICODE(v)[1];
9001 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9002 0xDC00 <= c1 && c1 <= 0xDFFF) {
9003 buf[0] = c0;
9004 buf[1] = c1;
9005 buf[2] = '\0';
9006 return 2;
9007 }
9008 }
9009#endif
9010 goto onError;
9011 }
9012 else {
9013 /* Integer input truncated to a character */
9014 long x;
9015 x = PyLong_AsLong(v);
9016 if (x == -1 && PyErr_Occurred())
9017 goto onError;
9018
9019 if (x < 0 || x > 0x10ffff) {
9020 PyErr_SetString(PyExc_OverflowError,
9021 "%c arg not in range(0x110000)");
9022 return -1;
9023 }
9024
9025#ifndef Py_UNICODE_WIDE
9026 if (x > 0xffff) {
9027 x -= 0x10000;
9028 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9029 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9030 return 2;
9031 }
9032#endif
9033 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009034 buf[1] = '\0';
9035 return 1;
9036 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009037
Benjamin Peterson29060642009-01-31 22:14:21 +00009038 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009039 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009041 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042}
9043
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The replacement list is fully parenthesized so that the macro expands
   as a single operand in any expression.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009048
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051{
9052 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009053 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009054 int args_owned = 0;
9055 PyUnicodeObject *result = NULL;
9056 PyObject *dict = NULL;
9057 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009058
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009060 PyErr_BadInternalCall();
9061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062 }
9063 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009064 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009065 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066 fmt = PyUnicode_AS_UNICODE(uformat);
9067 fmtcnt = PyUnicode_GET_SIZE(uformat);
9068
9069 reslen = rescnt = fmtcnt + 100;
9070 result = _PyUnicode_New(reslen);
9071 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073 res = PyUnicode_AS_UNICODE(result);
9074
9075 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 arglen = PyTuple_Size(args);
9077 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078 }
9079 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 arglen = -1;
9081 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009083 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009084 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086
9087 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 if (*fmt != '%') {
9089 if (--rescnt < 0) {
9090 rescnt = fmtcnt + 100;
9091 reslen += rescnt;
9092 if (_PyUnicode_Resize(&result, reslen) < 0)
9093 goto onError;
9094 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9095 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009096 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009098 }
9099 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 /* Got a format specifier */
9101 int flags = 0;
9102 Py_ssize_t width = -1;
9103 int prec = -1;
9104 Py_UNICODE c = '\0';
9105 Py_UNICODE fill;
9106 int isnumok;
9107 PyObject *v = NULL;
9108 PyObject *temp = NULL;
9109 Py_UNICODE *pbuf;
9110 Py_UNICODE sign;
9111 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009112 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 fmt++;
9115 if (*fmt == '(') {
9116 Py_UNICODE *keystart;
9117 Py_ssize_t keylen;
9118 PyObject *key;
9119 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009120
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 if (dict == NULL) {
9122 PyErr_SetString(PyExc_TypeError,
9123 "format requires a mapping");
9124 goto onError;
9125 }
9126 ++fmt;
9127 --fmtcnt;
9128 keystart = fmt;
9129 /* Skip over balanced parentheses */
9130 while (pcount > 0 && --fmtcnt >= 0) {
9131 if (*fmt == ')')
9132 --pcount;
9133 else if (*fmt == '(')
9134 ++pcount;
9135 fmt++;
9136 }
9137 keylen = fmt - keystart - 1;
9138 if (fmtcnt < 0 || pcount > 0) {
9139 PyErr_SetString(PyExc_ValueError,
9140 "incomplete format key");
9141 goto onError;
9142 }
9143#if 0
9144 /* keys are converted to strings using UTF-8 and
9145 then looked up since Python uses strings to hold
9146 variables names etc. in its namespaces and we
9147 wouldn't want to break common idioms. */
9148 key = PyUnicode_EncodeUTF8(keystart,
9149 keylen,
9150 NULL);
9151#else
9152 key = PyUnicode_FromUnicode(keystart, keylen);
9153#endif
9154 if (key == NULL)
9155 goto onError;
9156 if (args_owned) {
9157 Py_DECREF(args);
9158 args_owned = 0;
9159 }
9160 args = PyObject_GetItem(dict, key);
9161 Py_DECREF(key);
9162 if (args == NULL) {
9163 goto onError;
9164 }
9165 args_owned = 1;
9166 arglen = -1;
9167 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009168 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 while (--fmtcnt >= 0) {
9170 switch (c = *fmt++) {
9171 case '-': flags |= F_LJUST; continue;
9172 case '+': flags |= F_SIGN; continue;
9173 case ' ': flags |= F_BLANK; continue;
9174 case '#': flags |= F_ALT; continue;
9175 case '0': flags |= F_ZERO; continue;
9176 }
9177 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009178 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009179 if (c == '*') {
9180 v = getnextarg(args, arglen, &argidx);
9181 if (v == NULL)
9182 goto onError;
9183 if (!PyLong_Check(v)) {
9184 PyErr_SetString(PyExc_TypeError,
9185 "* wants int");
9186 goto onError;
9187 }
9188 width = PyLong_AsLong(v);
9189 if (width == -1 && PyErr_Occurred())
9190 goto onError;
9191 if (width < 0) {
9192 flags |= F_LJUST;
9193 width = -width;
9194 }
9195 if (--fmtcnt >= 0)
9196 c = *fmt++;
9197 }
9198 else if (c >= '0' && c <= '9') {
9199 width = c - '0';
9200 while (--fmtcnt >= 0) {
9201 c = *fmt++;
9202 if (c < '0' || c > '9')
9203 break;
9204 if ((width*10) / 10 != width) {
9205 PyErr_SetString(PyExc_ValueError,
9206 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009207 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009208 }
9209 width = width*10 + (c - '0');
9210 }
9211 }
9212 if (c == '.') {
9213 prec = 0;
9214 if (--fmtcnt >= 0)
9215 c = *fmt++;
9216 if (c == '*') {
9217 v = getnextarg(args, arglen, &argidx);
9218 if (v == NULL)
9219 goto onError;
9220 if (!PyLong_Check(v)) {
9221 PyErr_SetString(PyExc_TypeError,
9222 "* wants int");
9223 goto onError;
9224 }
9225 prec = PyLong_AsLong(v);
9226 if (prec == -1 && PyErr_Occurred())
9227 goto onError;
9228 if (prec < 0)
9229 prec = 0;
9230 if (--fmtcnt >= 0)
9231 c = *fmt++;
9232 }
9233 else if (c >= '0' && c <= '9') {
9234 prec = c - '0';
9235 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009236 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009237 if (c < '0' || c > '9')
9238 break;
9239 if ((prec*10) / 10 != prec) {
9240 PyErr_SetString(PyExc_ValueError,
9241 "prec too big");
9242 goto onError;
9243 }
9244 prec = prec*10 + (c - '0');
9245 }
9246 }
9247 } /* prec */
9248 if (fmtcnt >= 0) {
9249 if (c == 'h' || c == 'l' || c == 'L') {
9250 if (--fmtcnt >= 0)
9251 c = *fmt++;
9252 }
9253 }
9254 if (fmtcnt < 0) {
9255 PyErr_SetString(PyExc_ValueError,
9256 "incomplete format");
9257 goto onError;
9258 }
9259 if (c != '%') {
9260 v = getnextarg(args, arglen, &argidx);
9261 if (v == NULL)
9262 goto onError;
9263 }
9264 sign = 0;
9265 fill = ' ';
9266 switch (c) {
9267
9268 case '%':
9269 pbuf = formatbuf;
9270 /* presume that buffer length is at least 1 */
9271 pbuf[0] = '%';
9272 len = 1;
9273 break;
9274
9275 case 's':
9276 case 'r':
9277 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009278 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009279 temp = v;
9280 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009281 }
9282 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009283 if (c == 's')
9284 temp = PyObject_Str(v);
9285 else if (c == 'r')
9286 temp = PyObject_Repr(v);
9287 else
9288 temp = PyObject_ASCII(v);
9289 if (temp == NULL)
9290 goto onError;
9291 if (PyUnicode_Check(temp))
9292 /* nothing to do */;
9293 else {
9294 Py_DECREF(temp);
9295 PyErr_SetString(PyExc_TypeError,
9296 "%s argument has non-string str()");
9297 goto onError;
9298 }
9299 }
9300 pbuf = PyUnicode_AS_UNICODE(temp);
9301 len = PyUnicode_GET_SIZE(temp);
9302 if (prec >= 0 && len > prec)
9303 len = prec;
9304 break;
9305
9306 case 'i':
9307 case 'd':
9308 case 'u':
9309 case 'o':
9310 case 'x':
9311 case 'X':
9312 if (c == 'i')
9313 c = 'd';
9314 isnumok = 0;
9315 if (PyNumber_Check(v)) {
9316 PyObject *iobj=NULL;
9317
9318 if (PyLong_Check(v)) {
9319 iobj = v;
9320 Py_INCREF(iobj);
9321 }
9322 else {
9323 iobj = PyNumber_Long(v);
9324 }
9325 if (iobj!=NULL) {
9326 if (PyLong_Check(iobj)) {
9327 isnumok = 1;
9328 temp = formatlong(iobj, flags, prec, c);
9329 Py_DECREF(iobj);
9330 if (!temp)
9331 goto onError;
9332 pbuf = PyUnicode_AS_UNICODE(temp);
9333 len = PyUnicode_GET_SIZE(temp);
9334 sign = 1;
9335 }
9336 else {
9337 Py_DECREF(iobj);
9338 }
9339 }
9340 }
9341 if (!isnumok) {
9342 PyErr_Format(PyExc_TypeError,
9343 "%%%c format: a number is required, "
9344 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9345 goto onError;
9346 }
9347 if (flags & F_ZERO)
9348 fill = '0';
9349 break;
9350
9351 case 'e':
9352 case 'E':
9353 case 'f':
9354 case 'F':
9355 case 'g':
9356 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009357 temp = formatfloat(v, flags, prec, c);
9358 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009360 pbuf = PyUnicode_AS_UNICODE(temp);
9361 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009362 sign = 1;
9363 if (flags & F_ZERO)
9364 fill = '0';
9365 break;
9366
9367 case 'c':
9368 pbuf = formatbuf;
9369 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9370 if (len < 0)
9371 goto onError;
9372 break;
9373
9374 default:
9375 PyErr_Format(PyExc_ValueError,
9376 "unsupported format character '%c' (0x%x) "
9377 "at index %zd",
9378 (31<=c && c<=126) ? (char)c : '?',
9379 (int)c,
9380 (Py_ssize_t)(fmt - 1 -
9381 PyUnicode_AS_UNICODE(uformat)));
9382 goto onError;
9383 }
9384 if (sign) {
9385 if (*pbuf == '-' || *pbuf == '+') {
9386 sign = *pbuf++;
9387 len--;
9388 }
9389 else if (flags & F_SIGN)
9390 sign = '+';
9391 else if (flags & F_BLANK)
9392 sign = ' ';
9393 else
9394 sign = 0;
9395 }
9396 if (width < len)
9397 width = len;
9398 if (rescnt - (sign != 0) < width) {
9399 reslen -= rescnt;
9400 rescnt = width + fmtcnt + 100;
9401 reslen += rescnt;
9402 if (reslen < 0) {
9403 Py_XDECREF(temp);
9404 PyErr_NoMemory();
9405 goto onError;
9406 }
9407 if (_PyUnicode_Resize(&result, reslen) < 0) {
9408 Py_XDECREF(temp);
9409 goto onError;
9410 }
9411 res = PyUnicode_AS_UNICODE(result)
9412 + reslen - rescnt;
9413 }
9414 if (sign) {
9415 if (fill != ' ')
9416 *res++ = sign;
9417 rescnt--;
9418 if (width > len)
9419 width--;
9420 }
9421 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9422 assert(pbuf[0] == '0');
9423 assert(pbuf[1] == c);
9424 if (fill != ' ') {
9425 *res++ = *pbuf++;
9426 *res++ = *pbuf++;
9427 }
9428 rescnt -= 2;
9429 width -= 2;
9430 if (width < 0)
9431 width = 0;
9432 len -= 2;
9433 }
9434 if (width > len && !(flags & F_LJUST)) {
9435 do {
9436 --rescnt;
9437 *res++ = fill;
9438 } while (--width > len);
9439 }
9440 if (fill == ' ') {
9441 if (sign)
9442 *res++ = sign;
9443 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9444 assert(pbuf[0] == '0');
9445 assert(pbuf[1] == c);
9446 *res++ = *pbuf++;
9447 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009448 }
9449 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009450 Py_UNICODE_COPY(res, pbuf, len);
9451 res += len;
9452 rescnt -= len;
9453 while (--width >= len) {
9454 --rescnt;
9455 *res++ = ' ';
9456 }
9457 if (dict && (argidx < arglen) && c != '%') {
9458 PyErr_SetString(PyExc_TypeError,
9459 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009460 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009461 goto onError;
9462 }
9463 Py_XDECREF(temp);
9464 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465 } /* until end */
9466 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 PyErr_SetString(PyExc_TypeError,
9468 "not all arguments converted during string formatting");
9469 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 }
9471
Thomas Woutersa96affe2006-03-12 00:29:36 +00009472 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 }
9477 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478 return (PyObject *)result;
9479
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481 Py_XDECREF(result);
9482 Py_DECREF(uformat);
9483 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 }
9486 return NULL;
9487}
9488
Jeremy Hylton938ace62002-07-17 16:30:39 +00009489static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009490unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9491
Tim Peters6d6c1a32001-08-02 04:15:00 +00009492static PyObject *
9493unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9494{
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009496 static char *kwlist[] = {"object", "encoding", "errors", 0};
9497 char *encoding = NULL;
9498 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009499
Benjamin Peterson14339b62009-01-31 16:36:08 +00009500 if (type != &PyUnicode_Type)
9501 return unicode_subtype_new(type, args, kwds);
9502 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009504 return NULL;
9505 if (x == NULL)
9506 return (PyObject *)_PyUnicode_New(0);
9507 if (encoding == NULL && errors == NULL)
9508 return PyObject_Str(x);
9509 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009511}
9512
Guido van Rossume023fe02001-08-30 03:12:59 +00009513static PyObject *
9514unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9515{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009516 PyUnicodeObject *tmp, *pnew;
9517 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009518
Benjamin Peterson14339b62009-01-31 16:36:08 +00009519 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9520 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9521 if (tmp == NULL)
9522 return NULL;
9523 assert(PyUnicode_Check(tmp));
9524 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9525 if (pnew == NULL) {
9526 Py_DECREF(tmp);
9527 return NULL;
9528 }
9529 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9530 if (pnew->str == NULL) {
9531 _Py_ForgetReference((PyObject *)pnew);
9532 PyObject_Del(pnew);
9533 Py_DECREF(tmp);
9534 return PyErr_NoMemory();
9535 }
9536 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9537 pnew->length = n;
9538 pnew->hash = tmp->hash;
9539 Py_DECREF(tmp);
9540 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009541}
9542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009543PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009545\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009546Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009547encoding defaults to the current default string encoding.\n\
9548errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009549
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009550static PyObject *unicode_iter(PyObject *seq);
9551
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009553 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009554 "str", /* tp_name */
9555 sizeof(PyUnicodeObject), /* tp_size */
9556 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009558 (destructor)unicode_dealloc, /* tp_dealloc */
9559 0, /* tp_print */
9560 0, /* tp_getattr */
9561 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009562 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009563 unicode_repr, /* tp_repr */
9564 &unicode_as_number, /* tp_as_number */
9565 &unicode_as_sequence, /* tp_as_sequence */
9566 &unicode_as_mapping, /* tp_as_mapping */
9567 (hashfunc) unicode_hash, /* tp_hash*/
9568 0, /* tp_call*/
9569 (reprfunc) unicode_str, /* tp_str */
9570 PyObject_GenericGetAttr, /* tp_getattro */
9571 0, /* tp_setattro */
9572 0, /* tp_as_buffer */
9573 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009574 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009575 unicode_doc, /* tp_doc */
9576 0, /* tp_traverse */
9577 0, /* tp_clear */
9578 PyUnicode_RichCompare, /* tp_richcompare */
9579 0, /* tp_weaklistoffset */
9580 unicode_iter, /* tp_iter */
9581 0, /* tp_iternext */
9582 unicode_methods, /* tp_methods */
9583 0, /* tp_members */
9584 0, /* tp_getset */
9585 &PyBaseObject_Type, /* tp_base */
9586 0, /* tp_dict */
9587 0, /* tp_descr_get */
9588 0, /* tp_descr_set */
9589 0, /* tp_dictoffset */
9590 0, /* tp_init */
9591 0, /* tp_alloc */
9592 unicode_new, /* tp_new */
9593 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594};
9595
9596/* Initialize the Unicode implementation */
9597
Thomas Wouters78890102000-07-22 19:25:51 +00009598void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009600 int i;
9601
Thomas Wouters477c8d52006-05-27 19:21:47 +00009602 /* XXX - move this array to unicodectype.c ? */
9603 Py_UNICODE linebreak[] = {
9604 0x000A, /* LINE FEED */
9605 0x000D, /* CARRIAGE RETURN */
9606 0x001C, /* FILE SEPARATOR */
9607 0x001D, /* GROUP SEPARATOR */
9608 0x001E, /* RECORD SEPARATOR */
9609 0x0085, /* NEXT LINE */
9610 0x2028, /* LINE SEPARATOR */
9611 0x2029, /* PARAGRAPH SEPARATOR */
9612 };
9613
Fred Drakee4315f52000-05-09 19:53:39 +00009614 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009615 free_list = NULL;
9616 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009618 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009619 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009620
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009621 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009622 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009623 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009624 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009625
9626 /* initialize the linebreak bloom filter */
9627 bloom_linebreak = make_bloom_mask(
9628 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9629 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009630
9631 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632}
9633
9634/* Finalize the Unicode implementation */
9635
Christian Heimesa156e092008-02-16 07:38:31 +00009636int
9637PyUnicode_ClearFreeList(void)
9638{
9639 int freelist_size = numfree;
9640 PyUnicodeObject *u;
9641
9642 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009643 PyUnicodeObject *v = u;
9644 u = *(PyUnicodeObject **)u;
9645 if (v->str)
9646 PyObject_DEL(v->str);
9647 Py_XDECREF(v->defenc);
9648 PyObject_Del(v);
9649 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009650 }
9651 free_list = NULL;
9652 assert(numfree == 0);
9653 return freelist_size;
9654}
9655
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656void
Thomas Wouters78890102000-07-22 19:25:51 +00009657_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009659 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009661 Py_XDECREF(unicode_empty);
9662 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009664 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009665 if (unicode_latin1[i]) {
9666 Py_DECREF(unicode_latin1[i]);
9667 unicode_latin1[i] = NULL;
9668 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009669 }
Christian Heimesa156e092008-02-16 07:38:31 +00009670 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009672
Walter Dörwald16807132007-05-25 13:52:07 +00009673void
9674PyUnicode_InternInPlace(PyObject **p)
9675{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009676 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9677 PyObject *t;
9678 if (s == NULL || !PyUnicode_Check(s))
9679 Py_FatalError(
9680 "PyUnicode_InternInPlace: unicode strings only please!");
9681 /* If it's a subclass, we don't really know what putting
9682 it in the interned dict might do. */
9683 if (!PyUnicode_CheckExact(s))
9684 return;
9685 if (PyUnicode_CHECK_INTERNED(s))
9686 return;
9687 if (interned == NULL) {
9688 interned = PyDict_New();
9689 if (interned == NULL) {
9690 PyErr_Clear(); /* Don't leave an exception */
9691 return;
9692 }
9693 }
9694 /* It might be that the GetItem call fails even
9695 though the key is present in the dictionary,
9696 namely when this happens during a stack overflow. */
9697 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009698 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009699 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009700
Benjamin Peterson29060642009-01-31 22:14:21 +00009701 if (t) {
9702 Py_INCREF(t);
9703 Py_DECREF(*p);
9704 *p = t;
9705 return;
9706 }
Walter Dörwald16807132007-05-25 13:52:07 +00009707
Benjamin Peterson14339b62009-01-31 16:36:08 +00009708 PyThreadState_GET()->recursion_critical = 1;
9709 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9710 PyErr_Clear();
9711 PyThreadState_GET()->recursion_critical = 0;
9712 return;
9713 }
9714 PyThreadState_GET()->recursion_critical = 0;
9715 /* The two references in interned are not counted by refcnt.
9716 The deallocator will take care of this */
9717 Py_REFCNT(s) -= 2;
9718 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009719}
9720
9721void
9722PyUnicode_InternImmortal(PyObject **p)
9723{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009724 PyUnicode_InternInPlace(p);
9725 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9726 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9727 Py_INCREF(*p);
9728 }
Walter Dörwald16807132007-05-25 13:52:07 +00009729}
9730
9731PyObject *
9732PyUnicode_InternFromString(const char *cp)
9733{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009734 PyObject *s = PyUnicode_FromString(cp);
9735 if (s == NULL)
9736 return NULL;
9737 PyUnicode_InternInPlace(&s);
9738 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009739}
9740
9741void _Py_ReleaseInternedUnicodeStrings(void)
9742{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009743 PyObject *keys;
9744 PyUnicodeObject *s;
9745 Py_ssize_t i, n;
9746 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009747
Benjamin Peterson14339b62009-01-31 16:36:08 +00009748 if (interned == NULL || !PyDict_Check(interned))
9749 return;
9750 keys = PyDict_Keys(interned);
9751 if (keys == NULL || !PyList_Check(keys)) {
9752 PyErr_Clear();
9753 return;
9754 }
Walter Dörwald16807132007-05-25 13:52:07 +00009755
Benjamin Peterson14339b62009-01-31 16:36:08 +00009756 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9757 detector, interned unicode strings are not forcibly deallocated;
9758 rather, we give them their stolen references back, and then clear
9759 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009760
Benjamin Peterson14339b62009-01-31 16:36:08 +00009761 n = PyList_GET_SIZE(keys);
9762 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009763 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009764 for (i = 0; i < n; i++) {
9765 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9766 switch (s->state) {
9767 case SSTATE_NOT_INTERNED:
9768 /* XXX Shouldn't happen */
9769 break;
9770 case SSTATE_INTERNED_IMMORTAL:
9771 Py_REFCNT(s) += 1;
9772 immortal_size += s->length;
9773 break;
9774 case SSTATE_INTERNED_MORTAL:
9775 Py_REFCNT(s) += 2;
9776 mortal_size += s->length;
9777 break;
9778 default:
9779 Py_FatalError("Inconsistent interned string state.");
9780 }
9781 s->state = SSTATE_NOT_INTERNED;
9782 }
9783 fprintf(stderr, "total size of all interned strings: "
9784 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9785 "mortal/immortal\n", mortal_size, immortal_size);
9786 Py_DECREF(keys);
9787 PyDict_Clear(interned);
9788 Py_DECREF(interned);
9789 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009790}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009791
9792
9793/********************* Unicode Iterator **************************/
9794
9795typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009796 PyObject_HEAD
9797 Py_ssize_t it_index;
9798 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009799} unicodeiterobject;
9800
9801static void
9802unicodeiter_dealloc(unicodeiterobject *it)
9803{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009804 _PyObject_GC_UNTRACK(it);
9805 Py_XDECREF(it->it_seq);
9806 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009807}
9808
9809static int
9810unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9811{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009812 Py_VISIT(it->it_seq);
9813 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009814}
9815
9816static PyObject *
9817unicodeiter_next(unicodeiterobject *it)
9818{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009819 PyUnicodeObject *seq;
9820 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009821
Benjamin Peterson14339b62009-01-31 16:36:08 +00009822 assert(it != NULL);
9823 seq = it->it_seq;
9824 if (seq == NULL)
9825 return NULL;
9826 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009827
Benjamin Peterson14339b62009-01-31 16:36:08 +00009828 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9829 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009830 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009831 if (item != NULL)
9832 ++it->it_index;
9833 return item;
9834 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009835
Benjamin Peterson14339b62009-01-31 16:36:08 +00009836 Py_DECREF(seq);
9837 it->it_seq = NULL;
9838 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009839}
9840
9841static PyObject *
9842unicodeiter_len(unicodeiterobject *it)
9843{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009844 Py_ssize_t len = 0;
9845 if (it->it_seq)
9846 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9847 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009848}
9849
9850PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9851
9852static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009853 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009854 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009855 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009856};
9857
9858PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009859 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9860 "str_iterator", /* tp_name */
9861 sizeof(unicodeiterobject), /* tp_basicsize */
9862 0, /* tp_itemsize */
9863 /* methods */
9864 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9865 0, /* tp_print */
9866 0, /* tp_getattr */
9867 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009868 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009869 0, /* tp_repr */
9870 0, /* tp_as_number */
9871 0, /* tp_as_sequence */
9872 0, /* tp_as_mapping */
9873 0, /* tp_hash */
9874 0, /* tp_call */
9875 0, /* tp_str */
9876 PyObject_GenericGetAttr, /* tp_getattro */
9877 0, /* tp_setattro */
9878 0, /* tp_as_buffer */
9879 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9880 0, /* tp_doc */
9881 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9882 0, /* tp_clear */
9883 0, /* tp_richcompare */
9884 0, /* tp_weaklistoffset */
9885 PyObject_SelfIter, /* tp_iter */
9886 (iternextfunc)unicodeiter_next, /* tp_iternext */
9887 unicodeiter_methods, /* tp_methods */
9888 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009889};
9890
9891static PyObject *
9892unicode_iter(PyObject *seq)
9893{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009894 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009895
Benjamin Peterson14339b62009-01-31 16:36:08 +00009896 if (!PyUnicode_Check(seq)) {
9897 PyErr_BadInternalCall();
9898 return NULL;
9899 }
9900 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9901 if (it == NULL)
9902 return NULL;
9903 it->it_index = 0;
9904 Py_INCREF(seq);
9905 it->it_seq = (PyUnicodeObject *)seq;
9906 _PyObject_GC_TRACK(it);
9907 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009908}
9909
Martin v. Löwis5b222132007-06-10 09:51:05 +00009910size_t
9911Py_UNICODE_strlen(const Py_UNICODE *u)
9912{
9913 int res = 0;
9914 while(*u++)
9915 res++;
9916 return res;
9917}
9918
9919Py_UNICODE*
9920Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9921{
9922 Py_UNICODE *u = s1;
9923 while ((*u++ = *s2++));
9924 return s1;
9925}
9926
9927Py_UNICODE*
9928Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9929{
9930 Py_UNICODE *u = s1;
9931 while ((*u++ = *s2++))
9932 if (n-- == 0)
9933 break;
9934 return s1;
9935}
9936
9937int
9938Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9939{
9940 while (*s1 && *s2 && *s1 == *s2)
9941 s1++, s2++;
9942 if (*s1 && *s2)
9943 return (*s1 < *s2) ? -1 : +1;
9944 if (*s1)
9945 return 1;
9946 if (*s2)
9947 return -1;
9948 return 0;
9949}
9950
9951Py_UNICODE*
9952Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9953{
9954 const Py_UNICODE *p;
9955 for (p = s; *p; p++)
9956 if (*p == c)
9957 return (Py_UNICODE*)p;
9958 return NULL;
9959}
9960
Victor Stinner331ea922010-08-10 16:37:20 +00009961Py_UNICODE*
9962Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
9963{
9964 const Py_UNICODE *p;
9965 p = s + Py_UNICODE_strlen(s);
9966 while (p != s) {
9967 p--;
9968 if (*p == c)
9969 return (Py_UNICODE*)p;
9970 }
9971 return NULL;
9972}
9973
Martin v. Löwis5b222132007-06-10 09:51:05 +00009974
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009975#ifdef __cplusplus
9976}
9977#endif