blob: 849f33e07604ecfb50924df5fe10c3cd0a074a0d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
/* Limit for the Unicode object free list keep-alive optimization.

   The implementation will keep the allocated character buffer intact
   for all objects on the free list whose length (counted in Py_UNICODE
   characters, not bytes) is less than this limit.  This reduces
   malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ordinal in the range 0..127.  An entry is 1
   for the characters listed next to its row and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
     *   0x09 CHARACTER TABULATION
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Fast detection of the line break characters in the ASCII range.

   Lookup table indexed by an ordinal in the range 0..127.  An entry is 1
   for the characters listed next to its row and 0 for everything else.
   The table is only ever read (see BLOOM_LINEBREAK), so it is const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (its value modulo BLOOM_WIDTH) as the bit
   index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
232#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Nonzero if chr is one of the first setlen characters of set.  The bloom
   mask is checked first so that most non-members are rejected without
   scanning set.  The whole expansion is parenthesized so the macro can
   be negated or combined with other operators safely; chr is evaluated
   more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000310 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000313
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 return 0;
315}
316
/* We allocate one more character (Py_UNICODE element) than requested to
   make sure the string is Ux0000 terminated; some code (e.g.
   new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
325
326static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000327PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328{
329 register PyUnicodeObject *unicode;
330
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 if (length == 0 && unicode_empty != NULL) {
333 Py_INCREF(unicode_empty);
334 return unicode_empty;
335 }
336
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000337 /* Ensure we won't overflow the size. */
338 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
339 return (PyUnicodeObject *)PyErr_NoMemory();
340 }
341
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000343 if (free_list) {
344 unicode = free_list;
345 free_list = *(PyUnicodeObject **)unicode;
346 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000347 if (unicode->str) {
348 /* Keep-Alive optimization: we only upsize the buffer,
349 never downsize it. */
350 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000351 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000352 PyObject_DEL(unicode->str);
353 unicode->str = NULL;
354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000355 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000356 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000359 }
360 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000364 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 if (unicode == NULL)
366 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000367 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
368 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
370
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000371 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 PyErr_NoMemory();
373 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000374 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000375 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000376 * the caller fails before initializing str -- unicode_resize()
377 * reads str[0], and the Keep-Alive optimization can keep memory
378 * allocated for str alive across a call to unicode_dealloc(unicode).
379 * We don't want unicode_resize to read uninitialized memory in
380 * that case.
381 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000382 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000384 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000386 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000387 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000389
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000391 /* XXX UNREF/NEWREF interface should be more symmetrical */
392 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000393 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000394 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396}
397
398static
Guido van Rossum9475a232001-10-05 20:51:39 +0000399void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400{
Walter Dörwald16807132007-05-25 13:52:07 +0000401 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000402 case SSTATE_NOT_INTERNED:
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_MORTAL:
406 /* revive dead object temporarily for DelItem */
407 Py_REFCNT(unicode) = 3;
408 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
409 Py_FatalError(
410 "deletion of interned string failed");
411 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000412
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 case SSTATE_INTERNED_IMMORTAL:
414 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000415
Benjamin Peterson29060642009-01-31 22:14:21 +0000416 default:
417 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000418 }
419
Guido van Rossum604ddf82001-12-06 20:03:56 +0000420 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000422 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000423 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
424 PyObject_DEL(unicode->str);
425 unicode->str = NULL;
426 unicode->length = 0;
427 }
428 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000429 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000430 }
431 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000432 *(PyUnicodeObject **)unicode = free_list;
433 free_list = unicode;
434 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 }
436 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000437 PyObject_DEL(unicode->str);
438 Py_XDECREF(unicode->defenc);
439 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 }
441}
442
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000443static
444int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445{
446 register PyUnicodeObject *v;
447
448 /* Argument checks */
449 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 PyErr_BadInternalCall();
451 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000452 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000453 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000454 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 PyErr_BadInternalCall();
456 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000457 }
458
459 /* Resizing unicode_empty and single character objects is not
460 possible since these are being shared. We simply return a fresh
461 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000462 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000463 (v == unicode_empty || v->length == 1)) {
464 PyUnicodeObject *w = _PyUnicode_New(length);
465 if (w == NULL)
466 return -1;
467 Py_UNICODE_COPY(w->str, v->str,
468 length < v->length ? length : v->length);
469 Py_DECREF(*unicode);
470 *unicode = w;
471 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
473
474 /* Note that we don't have to modify *unicode for unshared Unicode
475 objects, since we can modify them in-place. */
476 return unicode_resize(v, length);
477}
478
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000479int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
480{
481 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
482}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000485 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486{
487 PyUnicodeObject *unicode;
488
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000489 /* If the Unicode data is known at construction time, we can apply
490 some optimizations which share commonly used objects. */
491 if (u != NULL) {
492
Benjamin Peterson29060642009-01-31 22:14:21 +0000493 /* Optimization for empty strings */
494 if (size == 0 && unicode_empty != NULL) {
495 Py_INCREF(unicode_empty);
496 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000497 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000498
499 /* Single character Unicode objects in the Latin-1 range are
500 shared when using this constructor */
501 if (size == 1 && *u < 256) {
502 unicode = unicode_latin1[*u];
503 if (!unicode) {
504 unicode = _PyUnicode_New(1);
505 if (!unicode)
506 return NULL;
507 unicode->str[0] = *u;
508 unicode_latin1[*u] = unicode;
509 }
510 Py_INCREF(unicode);
511 return (PyObject *)unicode;
512 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000513 }
Tim Petersced69f82003-09-16 20:30:58 +0000514
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515 unicode = _PyUnicode_New(size);
516 if (!unicode)
517 return NULL;
518
519 /* Copy the Unicode data into the new object */
520 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000521 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522
523 return (PyObject *)unicode;
524}
525
Walter Dörwaldd2034312007-05-18 16:29:38 +0000526PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527{
528 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Benjamin Peterson14339b62009-01-31 16:36:08 +0000530 if (size < 0) {
531 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000532 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000533 return NULL;
534 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000535
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000537 some optimizations which share commonly used objects.
538 Also, this means the input must be UTF-8, so fall back to the
539 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000540 if (u != NULL) {
541
Benjamin Peterson29060642009-01-31 22:14:21 +0000542 /* Optimization for empty strings */
543 if (size == 0 && unicode_empty != NULL) {
544 Py_INCREF(unicode_empty);
545 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000546 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000547
548 /* Single characters are shared when using this constructor.
549 Restrict to ASCII, since the input must be UTF-8. */
550 if (size == 1 && Py_CHARMASK(*u) < 128) {
551 unicode = unicode_latin1[Py_CHARMASK(*u)];
552 if (!unicode) {
553 unicode = _PyUnicode_New(1);
554 if (!unicode)
555 return NULL;
556 unicode->str[0] = Py_CHARMASK(*u);
557 unicode_latin1[Py_CHARMASK(*u)] = unicode;
558 }
559 Py_INCREF(unicode);
560 return (PyObject *)unicode;
561 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000562
563 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 }
565
Walter Dörwald55507312007-05-18 13:12:10 +0000566 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000567 if (!unicode)
568 return NULL;
569
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000570 return (PyObject *)unicode;
571}
572
Walter Dörwaldd2034312007-05-18 16:29:38 +0000573PyObject *PyUnicode_FromString(const char *u)
574{
575 size_t size = strlen(u);
576 if (size > PY_SSIZE_T_MAX) {
577 PyErr_SetString(PyExc_OverflowError, "input too long");
578 return NULL;
579 }
580
581 return PyUnicode_FromStringAndSize(u, size);
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584#ifdef HAVE_WCHAR_H
585
Mark Dickinson081dfee2009-03-18 14:47:41 +0000586#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
587# define CONVERT_WCHAR_TO_SURROGATES
588#endif
589
590#ifdef CONVERT_WCHAR_TO_SURROGATES
591
592/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
593 to convert from UTF32 to UTF16. */
594
595PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
596 Py_ssize_t size)
597{
598 PyUnicodeObject *unicode;
599 register Py_ssize_t i;
600 Py_ssize_t alloc;
601 const wchar_t *orig_w;
602
603 if (w == NULL) {
604 if (size == 0)
605 return PyUnicode_FromStringAndSize(NULL, 0);
606 PyErr_BadInternalCall();
607 return NULL;
608 }
609
610 if (size == -1) {
611 size = wcslen(w);
612 }
613
614 alloc = size;
615 orig_w = w;
616 for (i = size; i > 0; i--) {
617 if (*w > 0xFFFF)
618 alloc++;
619 w++;
620 }
621 w = orig_w;
622 unicode = _PyUnicode_New(alloc);
623 if (!unicode)
624 return NULL;
625
626 /* Copy the wchar_t data into the new object */
627 {
628 register Py_UNICODE *u;
629 u = PyUnicode_AS_UNICODE(unicode);
630 for (i = size; i > 0; i--) {
631 if (*w > 0xFFFF) {
632 wchar_t ordinal = *w++;
633 ordinal -= 0x10000;
634 *u++ = 0xD800 | (ordinal >> 10);
635 *u++ = 0xDC00 | (ordinal & 0x3FF);
636 }
637 else
638 *u++ = *w++;
639 }
640 }
641 return (PyObject *)unicode;
642}
643
644#else
645
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000647 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648{
649 PyUnicodeObject *unicode;
650
651 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000652 if (size == 0)
653 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 PyErr_BadInternalCall();
655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 }
657
Martin v. Löwis790465f2008-04-05 20:41:37 +0000658 if (size == -1) {
659 size = wcslen(w);
660 }
661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 unicode = _PyUnicode_New(size);
663 if (!unicode)
664 return NULL;
665
666 /* Copy the wchar_t data into the new object */
667#ifdef HAVE_USABLE_WCHAR_T
668 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000669#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000671 register Py_UNICODE *u;
672 register Py_ssize_t i;
673 u = PyUnicode_AS_UNICODE(unicode);
674 for (i = size; i > 0; i--)
675 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000676 }
677#endif
678
679 return (PyObject *)unicode;
680}
681
Mark Dickinson081dfee2009-03-18 14:47:41 +0000682#endif /* CONVERT_WCHAR_TO_SURROGATES */
683
684#undef CONVERT_WCHAR_TO_SURROGATES
685
Walter Dörwald346737f2007-05-31 10:44:43 +0000686static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000687makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
688 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000689{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000690 *fmt++ = '%';
691 if (width) {
692 if (zeropad)
693 *fmt++ = '0';
694 fmt += sprintf(fmt, "%d", width);
695 }
696 if (precision)
697 fmt += sprintf(fmt, ".%d", precision);
698 if (longflag)
699 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000700 else if (longlongflag) {
701 /* longlongflag should only ever be nonzero on machines with
702 HAVE_LONG_LONG defined */
703#ifdef HAVE_LONG_LONG
704 char *f = PY_FORMAT_LONG_LONG;
705 while (*f)
706 *fmt++ = *f++;
707#else
708 /* we shouldn't ever get here */
709 assert(0);
710 *fmt++ = 'l';
711#endif
712 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000713 else if (size_tflag) {
714 char *f = PY_FORMAT_SIZE_T;
715 while (*f)
716 *fmt++ = *f++;
717 }
718 *fmt++ = c;
719 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000720}
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
723
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000724/* size of fixed-size buffer for formatting single arguments */
725#define ITEM_BUFFER_LEN 21
726/* maximum number of characters required for output of %ld. 21 characters
727 allows for 64-bit integers (in decimal) and an optional sign. */
728#define MAX_LONG_CHARS 21
729/* maximum number of characters required for output of %lld.
730 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
731 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
732#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
733
Walter Dörwaldd2034312007-05-18 16:29:38 +0000734PyObject *
735PyUnicode_FromFormatV(const char *format, va_list vargs)
736{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000737 va_list count;
738 Py_ssize_t callcount = 0;
739 PyObject **callresults = NULL;
740 PyObject **callresult = NULL;
741 Py_ssize_t n = 0;
742 int width = 0;
743 int precision = 0;
744 int zeropad;
745 const char* f;
746 Py_UNICODE *s;
747 PyObject *string;
748 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000749 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000750 /* use abuffer instead of buffer, if we need more space
751 * (which can happen if there's a format specifier with width). */
752 char *abuffer = NULL;
753 char *realbuffer;
754 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000755 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000756 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000758 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000759 /* step 1: count the number of %S/%R/%A/%s format specifications
760 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
761 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
762 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000763 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000764 if (*f == '%') {
765 if (*(f+1)=='%')
766 continue;
767 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
768 ++callcount;
769 while (ISDIGIT((unsigned)*f))
770 width = (width*10) + *f++ - '0';
771 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
772 ;
773 if (*f == 's')
774 ++callcount;
775 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000776 }
777 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000778 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000779 if (callcount) {
780 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
781 if (!callresults) {
782 PyErr_NoMemory();
783 return NULL;
784 }
785 callresult = callresults;
786 }
787 /* step 3: figure out how large a buffer we need */
788 for (f = format; *f; f++) {
789 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000790#ifdef HAVE_LONG_LONG
791 int longlongflag = 0;
792#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 const char* p = f;
794 width = 0;
795 while (ISDIGIT((unsigned)*f))
796 width = (width*10) + *f++ - '0';
797 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
798 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
801 * they don't affect the amount of space we reserve.
802 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000803 if (*f == 'l') {
804 if (f[1] == 'd' || f[1] == 'u') {
805 ++f;
806 }
807#ifdef HAVE_LONG_LONG
808 else if (f[1] == 'l' &&
809 (f[2] == 'd' || f[2] == 'u')) {
810 longlongflag = 1;
811 f += 2;
812 }
813#endif
814 }
815 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000816 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000817 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818
Benjamin Peterson14339b62009-01-31 16:36:08 +0000819 switch (*f) {
820 case 'c':
821 (void)va_arg(count, int);
822 /* fall through... */
823 case '%':
824 n++;
825 break;
826 case 'd': case 'u': case 'i': case 'x':
827 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000828#ifdef HAVE_LONG_LONG
829 if (longlongflag) {
830 if (width < MAX_LONG_LONG_CHARS)
831 width = MAX_LONG_LONG_CHARS;
832 }
833 else
834#endif
835 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
836 including sign. Decimal takes the most space. This
837 isn't enough for octal. If a width is specified we
838 need more (which we allocate later). */
839 if (width < MAX_LONG_CHARS)
840 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000842 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000843 if (abuffersize < width)
844 abuffersize = width;
845 break;
846 case 's':
847 {
848 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000849 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000850 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
851 if (!str)
852 goto fail;
853 n += PyUnicode_GET_SIZE(str);
854 /* Remember the str and switch to the next slot */
855 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000856 break;
857 }
858 case 'U':
859 {
860 PyObject *obj = va_arg(count, PyObject *);
861 assert(obj && PyUnicode_Check(obj));
862 n += PyUnicode_GET_SIZE(obj);
863 break;
864 }
865 case 'V':
866 {
867 PyObject *obj = va_arg(count, PyObject *);
868 const char *str = va_arg(count, const char *);
869 assert(obj || str);
870 assert(!obj || PyUnicode_Check(obj));
871 if (obj)
872 n += PyUnicode_GET_SIZE(obj);
873 else
874 n += strlen(str);
875 break;
876 }
877 case 'S':
878 {
879 PyObject *obj = va_arg(count, PyObject *);
880 PyObject *str;
881 assert(obj);
882 str = PyObject_Str(obj);
883 if (!str)
884 goto fail;
885 n += PyUnicode_GET_SIZE(str);
886 /* Remember the str and switch to the next slot */
887 *callresult++ = str;
888 break;
889 }
890 case 'R':
891 {
892 PyObject *obj = va_arg(count, PyObject *);
893 PyObject *repr;
894 assert(obj);
895 repr = PyObject_Repr(obj);
896 if (!repr)
897 goto fail;
898 n += PyUnicode_GET_SIZE(repr);
899 /* Remember the repr and switch to the next slot */
900 *callresult++ = repr;
901 break;
902 }
903 case 'A':
904 {
905 PyObject *obj = va_arg(count, PyObject *);
906 PyObject *ascii;
907 assert(obj);
908 ascii = PyObject_ASCII(obj);
909 if (!ascii)
910 goto fail;
911 n += PyUnicode_GET_SIZE(ascii);
912 /* Remember the repr and switch to the next slot */
913 *callresult++ = ascii;
914 break;
915 }
916 case 'p':
917 (void) va_arg(count, int);
918 /* maximum 64-bit pointer representation:
919 * 0xffffffffffffffff
920 * so 19 characters is enough.
921 * XXX I count 18 -- what's the extra for?
922 */
923 n += 19;
924 break;
925 default:
926 /* if we stumble upon an unknown
927 formatting code, copy the rest of
928 the format string to the output
929 string. (we cannot just skip the
930 code, since there's no way to know
931 what's in the argument list) */
932 n += strlen(p);
933 goto expand;
934 }
935 } else
936 n++;
937 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000938 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000939 if (abuffersize > ITEM_BUFFER_LEN) {
940 /* add 1 for sprintf's trailing null byte */
941 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000942 if (!abuffer) {
943 PyErr_NoMemory();
944 goto fail;
945 }
946 realbuffer = abuffer;
947 }
948 else
949 realbuffer = buffer;
950 /* step 4: fill the buffer */
951 /* Since we've analyzed how much space we need for the worst case,
952 we don't have to resize the string.
953 There can be no errors beyond this point. */
954 string = PyUnicode_FromUnicode(NULL, n);
955 if (!string)
956 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000957
Benjamin Peterson14339b62009-01-31 16:36:08 +0000958 s = PyUnicode_AS_UNICODE(string);
959 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000960
Benjamin Peterson14339b62009-01-31 16:36:08 +0000961 for (f = format; *f; f++) {
962 if (*f == '%') {
963 const char* p = f++;
964 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000965 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000966 int size_tflag = 0;
967 zeropad = (*f == '0');
968 /* parse the width.precision part */
969 width = 0;
970 while (ISDIGIT((unsigned)*f))
971 width = (width*10) + *f++ - '0';
972 precision = 0;
973 if (*f == '.') {
974 f++;
975 while (ISDIGIT((unsigned)*f))
976 precision = (precision*10) + *f++ - '0';
977 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000978 /* Handle %ld, %lu, %lld and %llu. */
979 if (*f == 'l') {
980 if (f[1] == 'd' || f[1] == 'u') {
981 longflag = 1;
982 ++f;
983 }
984#ifdef HAVE_LONG_LONG
985 else if (f[1] == 'l' &&
986 (f[2] == 'd' || f[2] == 'u')) {
987 longlongflag = 1;
988 f += 2;
989 }
990#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000991 }
992 /* handle the size_t flag. */
993 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
994 size_tflag = 1;
995 ++f;
996 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000997
Benjamin Peterson14339b62009-01-31 16:36:08 +0000998 switch (*f) {
999 case 'c':
1000 *s++ = va_arg(vargs, int);
1001 break;
1002 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001003 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1004 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001005 if (longflag)
1006 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001007#ifdef HAVE_LONG_LONG
1008 else if (longlongflag)
1009 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1010#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001011 else if (size_tflag)
1012 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1013 else
1014 sprintf(realbuffer, fmt, va_arg(vargs, int));
1015 appendstring(realbuffer);
1016 break;
1017 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001018 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1019 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001020 if (longflag)
1021 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001022#ifdef HAVE_LONG_LONG
1023 else if (longlongflag)
1024 sprintf(realbuffer, fmt, va_arg(vargs,
1025 unsigned PY_LONG_LONG));
1026#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001027 else if (size_tflag)
1028 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1029 else
1030 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1031 appendstring(realbuffer);
1032 break;
1033 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 sprintf(realbuffer, fmt, va_arg(vargs, int));
1036 appendstring(realbuffer);
1037 break;
1038 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001039 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001040 sprintf(realbuffer, fmt, va_arg(vargs, int));
1041 appendstring(realbuffer);
1042 break;
1043 case 's':
1044 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001045 /* unused, since we already have the result */
1046 (void) va_arg(vargs, char *);
1047 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1048 PyUnicode_GET_SIZE(*callresult));
1049 s += PyUnicode_GET_SIZE(*callresult);
1050 /* We're done with the unicode()/repr() => forget it */
1051 Py_DECREF(*callresult);
1052 /* switch to next unicode()/repr() result */
1053 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001054 break;
1055 }
1056 case 'U':
1057 {
1058 PyObject *obj = va_arg(vargs, PyObject *);
1059 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1060 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1061 s += size;
1062 break;
1063 }
1064 case 'V':
1065 {
1066 PyObject *obj = va_arg(vargs, PyObject *);
1067 const char *str = va_arg(vargs, const char *);
1068 if (obj) {
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 } else {
1073 appendstring(str);
1074 }
1075 break;
1076 }
1077 case 'S':
1078 case 'R':
1079 {
1080 Py_UNICODE *ucopy;
1081 Py_ssize_t usize;
1082 Py_ssize_t upos;
1083 /* unused, since we already have the result */
1084 (void) va_arg(vargs, PyObject *);
1085 ucopy = PyUnicode_AS_UNICODE(*callresult);
1086 usize = PyUnicode_GET_SIZE(*callresult);
1087 for (upos = 0; upos<usize;)
1088 *s++ = ucopy[upos++];
1089 /* We're done with the unicode()/repr() => forget it */
1090 Py_DECREF(*callresult);
1091 /* switch to next unicode()/repr() result */
1092 ++callresult;
1093 break;
1094 }
1095 case 'p':
1096 sprintf(buffer, "%p", va_arg(vargs, void*));
1097 /* %p is ill-defined: ensure leading 0x. */
1098 if (buffer[1] == 'X')
1099 buffer[1] = 'x';
1100 else if (buffer[1] != 'x') {
1101 memmove(buffer+2, buffer, strlen(buffer)+1);
1102 buffer[0] = '0';
1103 buffer[1] = 'x';
1104 }
1105 appendstring(buffer);
1106 break;
1107 case '%':
1108 *s++ = '%';
1109 break;
1110 default:
1111 appendstring(p);
1112 goto end;
1113 }
1114 } else
1115 *s++ = *f;
1116 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001117
Benjamin Peterson29060642009-01-31 22:14:21 +00001118 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001119 if (callresults)
1120 PyObject_Free(callresults);
1121 if (abuffer)
1122 PyObject_Free(abuffer);
1123 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1124 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001125 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001126 if (callresults) {
1127 PyObject **callresult2 = callresults;
1128 while (callresult2 < callresult) {
1129 Py_DECREF(*callresult2);
1130 ++callresult2;
1131 }
1132 PyObject_Free(callresults);
1133 }
1134 if (abuffer)
1135 PyObject_Free(abuffer);
1136 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001137}
1138
1139#undef appendstring
1140
1141PyObject *
1142PyUnicode_FromFormat(const char *format, ...)
1143{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 PyObject* ret;
1145 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001146
1147#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001148 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001149#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001150 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001151#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 ret = PyUnicode_FromFormatV(format, vargs);
1153 va_end(vargs);
1154 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
Martin v. Löwis18e16552006-02-15 17:27:45 +00001157Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 wchar_t *w,
1159 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160{
1161 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001162 PyErr_BadInternalCall();
1163 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001165
1166 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001169
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170#ifdef HAVE_USABLE_WCHAR_T
1171 memcpy(w, unicode->str, size * sizeof(wchar_t));
1172#else
1173 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001174 register Py_UNICODE *u;
1175 register Py_ssize_t i;
1176 u = PyUnicode_AS_UNICODE(unicode);
1177 for (i = size; i > 0; i--)
1178 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 }
1180#endif
1181
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001182 if (size > PyUnicode_GET_SIZE(unicode))
1183 return PyUnicode_GET_SIZE(unicode);
1184 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001185 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186}
1187
1188#endif
1189
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001190PyObject *PyUnicode_FromOrdinal(int ordinal)
1191{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001192 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001193
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001194 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 PyErr_SetString(PyExc_ValueError,
1196 "chr() arg not in range(0x110000)");
1197 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001198 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001199
1200#ifndef Py_UNICODE_WIDE
1201 if (ordinal > 0xffff) {
1202 ordinal -= 0x10000;
1203 s[0] = 0xD800 | (ordinal >> 10);
1204 s[1] = 0xDC00 | (ordinal & 0x3FF);
1205 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001206 }
1207#endif
1208
Hye-Shik Chang40574832004-04-06 07:24:51 +00001209 s[0] = (Py_UNICODE)ordinal;
1210 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001211}
1212
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213PyObject *PyUnicode_FromObject(register PyObject *obj)
1214{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001215 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001217 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001218 Py_INCREF(obj);
1219 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001220 }
1221 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001222 /* For a Unicode subtype that's not a Unicode object,
1223 return a true Unicode object with the same data. */
1224 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1225 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001226 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001227 PyErr_Format(PyExc_TypeError,
1228 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001229 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001230 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001231}
1232
1233PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 const char *encoding,
1235 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001236{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001237 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001238 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001239 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001240
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001242 PyErr_BadInternalCall();
1243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001245
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001246 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001247 PyErr_SetString(PyExc_TypeError,
1248 "decoding str is not supported");
1249 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001250 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001251
1252 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001253 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001254 s = PyBytes_AS_STRING(obj);
1255 len = PyBytes_GET_SIZE(obj);
1256 }
1257 else if (PyByteArray_Check(obj)) {
1258 s = PyByteArray_AS_STRING(obj);
1259 len = PyByteArray_GET_SIZE(obj);
1260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001262 /* Overwrite the error message with something more useful in
1263 case of a TypeError. */
1264 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001265 PyErr_Format(PyExc_TypeError,
Georg Brandl952867a2010-06-27 10:17:12 +00001266 "coercing to str: need bytes, bytearray or char buffer, "
Benjamin Peterson29060642009-01-31 22:14:21 +00001267 "%.80s found",
1268 Py_TYPE(obj)->tp_name);
1269 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001270 }
Tim Petersced69f82003-09-16 20:30:58 +00001271
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001272 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001274 Py_INCREF(unicode_empty);
1275 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 }
Tim Petersced69f82003-09-16 20:30:58 +00001277 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001278 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001279
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001280 return v;
1281
Benjamin Peterson29060642009-01-31 22:14:21 +00001282 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284}
1285
/* Write a normalized copy of `encoding` into `lower`: upper-case
   characters (per ISUPPER) are lower-cased and '_' becomes '-', so that
   e.g. UTF_8 is recognised.  `lower_len` is the capacity of `lower`
   including the terminating NUL.

   Return 1 on success, 0 if `encoding` does not fit (it is longer than
   lower_len-1 characters); in the latter case `lower` is left
   unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    size_t pos;

    for (pos = 0; encoding[pos] != '\0'; pos++) {
        char ch = encoding[pos];

        /* the last slot is reserved for the terminator */
        if (pos == lower_len - 1)
            return 0;

        if (ISUPPER(ch))
            lower[pos] = TOLOWER(ch);
        else if (ch == '_')
            lower[pos] = '-';
        else
            lower[pos] = ch;
    }
    lower[pos] = '\0';
    return 1;
}
1318
1319PyObject *PyUnicode_Decode(const char *s,
1320 Py_ssize_t size,
1321 const char *encoding,
1322 const char *errors)
1323{
1324 PyObject *buffer = NULL, *unicode;
1325 Py_buffer info;
1326 char lower[11]; /* Enough for any encoding shortcut */
1327
1328 if (encoding == NULL)
1329 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001330
1331 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001332 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1333 if (strcmp(lower, "utf-8") == 0)
1334 return PyUnicode_DecodeUTF8(s, size, errors);
1335 else if ((strcmp(lower, "latin-1") == 0) ||
1336 (strcmp(lower, "iso-8859-1") == 0))
1337 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001338#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001339 else if (strcmp(lower, "mbcs") == 0)
1340 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001341#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001342 else if (strcmp(lower, "ascii") == 0)
1343 return PyUnicode_DecodeASCII(s, size, errors);
1344 else if (strcmp(lower, "utf-16") == 0)
1345 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1346 else if (strcmp(lower, "utf-32") == 0)
1347 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349
1350 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001351 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001352 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001353 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001354 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 if (buffer == NULL)
1356 goto onError;
1357 unicode = PyCodec_Decode(buffer, encoding, errors);
1358 if (unicode == NULL)
1359 goto onError;
1360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001362 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001363 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364 Py_DECREF(unicode);
1365 goto onError;
1366 }
1367 Py_DECREF(buffer);
1368 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001369
Benjamin Peterson29060642009-01-31 22:14:21 +00001370 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371 Py_XDECREF(buffer);
1372 return NULL;
1373}
1374
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1376 const char *encoding,
1377 const char *errors)
1378{
1379 PyObject *v;
1380
1381 if (!PyUnicode_Check(unicode)) {
1382 PyErr_BadArgument();
1383 goto onError;
1384 }
1385
1386 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001387 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001388
1389 /* Decode via the codec registry */
1390 v = PyCodec_Decode(unicode, encoding, errors);
1391 if (v == NULL)
1392 goto onError;
1393 return v;
1394
Benjamin Peterson29060642009-01-31 22:14:21 +00001395 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001396 return NULL;
1397}
1398
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001399PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1400 const char *encoding,
1401 const char *errors)
1402{
1403 PyObject *v;
1404
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409
1410 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001411 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001412
1413 /* Decode via the codec registry */
1414 v = PyCodec_Decode(unicode, encoding, errors);
1415 if (v == NULL)
1416 goto onError;
1417 if (!PyUnicode_Check(v)) {
1418 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001419 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001420 Py_TYPE(v)->tp_name);
1421 Py_DECREF(v);
1422 goto onError;
1423 }
1424 return v;
1425
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001427 return NULL;
1428}
1429
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001431 Py_ssize_t size,
1432 const char *encoding,
1433 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434{
1435 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001436
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437 unicode = PyUnicode_FromUnicode(s, size);
1438 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1441 Py_DECREF(unicode);
1442 return v;
1443}
1444
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001445PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1446 const char *encoding,
1447 const char *errors)
1448{
1449 PyObject *v;
1450
1451 if (!PyUnicode_Check(unicode)) {
1452 PyErr_BadArgument();
1453 goto onError;
1454 }
1455
1456 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001457 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001458
1459 /* Encode via the codec registry */
1460 v = PyCodec_Encode(unicode, encoding, errors);
1461 if (v == NULL)
1462 goto onError;
1463 return v;
1464
Benjamin Peterson29060642009-01-31 22:14:21 +00001465 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001466 return NULL;
1467}
1468
Victor Stinnerae6265f2010-05-15 16:27:27 +00001469PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1470{
Victor Stinner313a1202010-06-11 23:56:51 +00001471 if (Py_FileSystemDefaultEncoding) {
1472#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1473 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1474 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1475 PyUnicode_GET_SIZE(unicode),
1476 NULL);
1477#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001478 return PyUnicode_AsEncodedString(unicode,
1479 Py_FileSystemDefaultEncoding,
1480 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001481 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001482 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1483 PyUnicode_GET_SIZE(unicode),
1484 "surrogateescape");
1485}
1486
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1488 const char *encoding,
1489 const char *errors)
1490{
1491 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001492 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001493
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 if (!PyUnicode_Check(unicode)) {
1495 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497 }
Fred Drakee4315f52000-05-09 19:53:39 +00001498
Tim Petersced69f82003-09-16 20:30:58 +00001499 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001500 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001501
1502 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001503 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1504 if (strcmp(lower, "utf-8") == 0)
1505 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1506 PyUnicode_GET_SIZE(unicode),
1507 errors);
1508 else if ((strcmp(lower, "latin-1") == 0) ||
1509 (strcmp(lower, "iso-8859-1") == 0))
1510 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1511 PyUnicode_GET_SIZE(unicode),
1512 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001513#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001514 else if (strcmp(lower, "mbcs") == 0)
1515 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001518#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001519 else if (strcmp(lower, "ascii") == 0)
1520 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1521 PyUnicode_GET_SIZE(unicode),
1522 errors);
1523 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001524 /* During bootstrap, we may need to find the encodings
1525 package, to load the file system encoding, and require the
1526 file system encoding in order to load the encodings
1527 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001528
Victor Stinner59e62db2010-05-15 13:14:32 +00001529 Break out of this dependency by assuming that the path to
1530 the encodings module is ASCII-only. XXX could try wcstombs
1531 instead, if the file system encoding is the locale's
1532 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001533 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001534 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1535 !PyThreadState_GET()->interp->codecs_initialized)
1536 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1537 PyUnicode_GET_SIZE(unicode),
1538 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539
1540 /* Encode via the codec registry */
1541 v = PyCodec_Encode(unicode, encoding, errors);
1542 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001543 return NULL;
1544
1545 /* The normal path */
1546 if (PyBytes_Check(v))
1547 return v;
1548
1549 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001550 if (PyByteArray_Check(v)) {
1551 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001552 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001553 PyOS_snprintf(msg, sizeof(msg),
1554 "encoder %s returned buffer instead of bytes",
1555 encoding);
1556 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001557 Py_DECREF(v);
1558 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001559 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001561 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1562 Py_DECREF(v);
1563 return b;
1564 }
1565
1566 PyErr_Format(PyExc_TypeError,
1567 "encoder did not return a bytes object (type=%.400s)",
1568 Py_TYPE(v)->tp_name);
1569 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001570 return NULL;
1571}
1572
1573PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1574 const char *encoding,
1575 const char *errors)
1576{
1577 PyObject *v;
1578
1579 if (!PyUnicode_Check(unicode)) {
1580 PyErr_BadArgument();
1581 goto onError;
1582 }
1583
1584 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001586
1587 /* Encode via the codec registry */
1588 v = PyCodec_Encode(unicode, encoding, errors);
1589 if (v == NULL)
1590 goto onError;
1591 if (!PyUnicode_Check(v)) {
1592 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001593 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001594 Py_TYPE(v)->tp_name);
1595 Py_DECREF(v);
1596 goto onError;
1597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001599
Benjamin Peterson29060642009-01-31 22:14:21 +00001600 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 return NULL;
1602}
1603
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001604PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001606{
1607 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001608 if (v)
1609 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001610 if (errors != NULL)
1611 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001612 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001613 PyUnicode_GET_SIZE(unicode),
1614 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001615 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001616 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001617 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001618 return v;
1619}
1620
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001621PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001622PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001623 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001624 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1625}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001626
Christian Heimes5894ba72007-11-04 11:43:14 +00001627PyObject*
1628PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1629{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001630 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1631 can be undefined. If it is case, decode using UTF-8. The following assumes
1632 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1633 bootstrapping process where the codecs aren't ready yet.
1634 */
1635 if (Py_FileSystemDefaultEncoding) {
1636#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001637 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001638 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001639 }
1640#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001641 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001642 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001643 }
1644#endif
1645 return PyUnicode_Decode(s, size,
1646 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001647 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001648 }
1649 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001650 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001651 }
1652}
1653
Martin v. Löwis011e8422009-05-05 04:43:17 +00001654/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001655 system encoding. The addr param must be a PyObject**.
1656 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001657
1658int
1659PyUnicode_FSConverter(PyObject* arg, void* addr)
1660{
1661 PyObject *output = NULL;
1662 Py_ssize_t size;
1663 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001664 if (arg == NULL) {
1665 Py_DECREF(*(PyObject**)addr);
1666 return 1;
1667 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001668 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001669 output = arg;
1670 Py_INCREF(output);
1671 }
1672 else {
1673 arg = PyUnicode_FromObject(arg);
1674 if (!arg)
1675 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001676 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001677 Py_DECREF(arg);
1678 if (!output)
1679 return 0;
1680 if (!PyBytes_Check(output)) {
1681 Py_DECREF(output);
1682 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1683 return 0;
1684 }
1685 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001686 size = PyBytes_GET_SIZE(output);
1687 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001688 if (size != strlen(data)) {
1689 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1690 Py_DECREF(output);
1691 return 0;
1692 }
1693 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001694 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001695}
1696
1697
Martin v. Löwis5b222132007-06-10 09:51:05 +00001698char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001699_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001700{
Christian Heimesf3863112007-11-22 07:46:41 +00001701 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001702 if (!PyUnicode_Check(unicode)) {
1703 PyErr_BadArgument();
1704 return NULL;
1705 }
Christian Heimesf3863112007-11-22 07:46:41 +00001706 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1707 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001708 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001709 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001710 *psize = PyBytes_GET_SIZE(bytes);
1711 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001712}
1713
1714char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001715_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001716{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001717 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001718}
1719
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1721{
1722 if (!PyUnicode_Check(unicode)) {
1723 PyErr_BadArgument();
1724 goto onError;
1725 }
1726 return PyUnicode_AS_UNICODE(unicode);
1727
Benjamin Peterson29060642009-01-31 22:14:21 +00001728 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 return NULL;
1730}
1731
Martin v. Löwis18e16552006-02-15 17:27:45 +00001732Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733{
1734 if (!PyUnicode_Check(unicode)) {
1735 PyErr_BadArgument();
1736 goto onError;
1737 }
1738 return PyUnicode_GET_SIZE(unicode);
1739
Benjamin Peterson29060642009-01-31 22:14:21 +00001740 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 return -1;
1742}
1743
Thomas Wouters78890102000-07-22 19:25:51 +00001744const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001745{
1746 return unicode_default_encoding;
1747}
1748
1749int PyUnicode_SetDefaultEncoding(const char *encoding)
1750{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001751 if (strcmp(encoding, unicode_default_encoding) != 0) {
1752 PyErr_Format(PyExc_ValueError,
1753 "Can only set default encoding to %s",
1754 unicode_default_encoding);
1755 return -1;
1756 }
Fred Drakee4315f52000-05-09 19:53:39 +00001757 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001758}
1759
Victor Stinner554f3f02010-06-16 23:33:54 +00001760/* create or adjust a UnicodeDecodeError */
1761static void
1762make_decode_exception(PyObject **exceptionObject,
1763 const char *encoding,
1764 const char *input, Py_ssize_t length,
1765 Py_ssize_t startpos, Py_ssize_t endpos,
1766 const char *reason)
1767{
1768 if (*exceptionObject == NULL) {
1769 *exceptionObject = PyUnicodeDecodeError_Create(
1770 encoding, input, length, startpos, endpos, reason);
1771 }
1772 else {
1773 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1774 goto onError;
1775 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1776 goto onError;
1777 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1778 goto onError;
1779 }
1780 return;
1781
1782onError:
1783 Py_DECREF(*exceptionObject);
1784 *exceptionObject = NULL;
1785}
1786
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001787/* error handling callback helper:
1788 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001789 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 and adjust various state variables.
1791 return 0 on success, -1 on error
1792*/
1793
1794static
1795int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001796 const char *encoding, const char *reason,
1797 const char **input, const char **inend, Py_ssize_t *startinpos,
1798 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1799 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001800{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001801 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001802
1803 PyObject *restuple = NULL;
1804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001806 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001807 Py_ssize_t requiredsize;
1808 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001810 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001811 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812 int res = -1;
1813
1814 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001815 *errorHandler = PyCodec_LookupError(errors);
1816 if (*errorHandler == NULL)
1817 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 }
1819
Victor Stinner554f3f02010-06-16 23:33:54 +00001820 make_decode_exception(exceptionObject,
1821 encoding,
1822 *input, *inend - *input,
1823 *startinpos, *endinpos,
1824 reason);
1825 if (*exceptionObject == NULL)
1826 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827
1828 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1829 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001830 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001832 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001833 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001834 }
1835 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001836 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001837
1838 /* Copy back the bytes variables, which might have been modified by the
1839 callback */
1840 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1841 if (!inputobj)
1842 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001843 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001844 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001845 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001846 *input = PyBytes_AS_STRING(inputobj);
1847 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001848 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001849 /* we can DECREF safely, as the exception has another reference,
1850 so the object won't go away. */
1851 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001853 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001854 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001855 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1857 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001858 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001859
1860 /* need more space? (at least enough for what we
1861 have+the replacement+the rest of the string (starting
1862 at the new input position), so we won't have to check space
1863 when there are no errors in the rest of the string) */
1864 repptr = PyUnicode_AS_UNICODE(repunicode);
1865 repsize = PyUnicode_GET_SIZE(repunicode);
1866 requiredsize = *outpos + repsize + insize-newpos;
1867 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001868 if (requiredsize<2*outsize)
1869 requiredsize = 2*outsize;
1870 if (_PyUnicode_Resize(output, requiredsize) < 0)
1871 goto onError;
1872 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001873 }
1874 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001875 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 Py_UNICODE_COPY(*outptr, repptr, repsize);
1877 *outptr += repsize;
1878 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001880 /* we made it! */
1881 res = 0;
1882
Benjamin Peterson29060642009-01-31 22:14:21 +00001883 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 Py_XDECREF(restuple);
1885 return res;
1886}
1887
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001888/* --- UTF-7 Codec -------------------------------------------------------- */
1889
Antoine Pitrou244651a2009-05-04 18:56:13 +00001890/* See RFC2152 for details. We encode conservatively and decode liberally. */
1891
/* Base-64 helper macros for the UTF-7 codec.  Each one evaluates its
   argument more than once (TO_BASE64 excepted), so pass only
   side-effect-free expressions. */

/* IS_BASE64: is c one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of base-64 character c.  The
   result is only meaningful when IS_BASE64(c) holds; every character
   outside the tested ranges other than '+' maps to 63. */

#define FROM_BASE64(c)                              \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :       \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :  \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: should this byte, met outside a shift sequence in a
 * UTF-7 string, be decoded as itself?  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1922
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; NUL and anything >= 128 never qualify
 *             (the range test also guards the table lookup).
 * directO  -- non-zero if Set O characters are written directly.
 * directWS -- non-zero if whitespace is written directly.
 *
 * The flag arguments are parenthesized so that compound expressions
 * (e.g. "a || b") associate as the caller intends.  c is evaluated
 * more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001968
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001969PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001970 Py_ssize_t size,
1971 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001972{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001973 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1974}
1975
Antoine Pitrou244651a2009-05-04 18:56:13 +00001976/* The decoder. The only state we preserve is our read position,
1977 * i.e. how many characters we have consumed. So if we end in the
1978 * middle of a shift sequence we have to back off the read position
1979 * and the output to the beginning of the sequence, otherwise we lose
1980 * all the shift state (seen bits, number of bits seen, high
1981 * surrogate). */
1982
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001983PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001984 Py_ssize_t size,
1985 const char *errors,
1986 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001987{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001988 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001989 Py_ssize_t startinpos;
1990 Py_ssize_t endinpos;
1991 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001992 const char *e;
1993 PyUnicodeObject *unicode;
1994 Py_UNICODE *p;
1995 const char *errmsg = "";
1996 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001997 Py_UNICODE *shiftOutStart;
1998 unsigned int base64bits = 0;
1999 unsigned long base64buffer = 0;
2000 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002001 PyObject *errorHandler = NULL;
2002 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002003
2004 unicode = _PyUnicode_New(size);
2005 if (!unicode)
2006 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002007 if (size == 0) {
2008 if (consumed)
2009 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002010 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002011 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002012
2013 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002014 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002015 e = s + size;
2016
2017 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002018 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002019 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002020 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002021
Antoine Pitrou244651a2009-05-04 18:56:13 +00002022 if (inShift) { /* in a base-64 section */
2023 if (IS_BASE64(ch)) { /* consume a base-64 character */
2024 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2025 base64bits += 6;
2026 s++;
2027 if (base64bits >= 16) {
2028 /* we have enough bits for a UTF-16 value */
2029 Py_UNICODE outCh = (Py_UNICODE)
2030 (base64buffer >> (base64bits-16));
2031 base64bits -= 16;
2032 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2033 if (surrogate) {
2034 /* expecting a second surrogate */
2035 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2036#ifdef Py_UNICODE_WIDE
2037 *p++ = (((surrogate & 0x3FF)<<10)
2038 | (outCh & 0x3FF)) + 0x10000;
2039#else
2040 *p++ = surrogate;
2041 *p++ = outCh;
2042#endif
2043 surrogate = 0;
2044 }
2045 else {
2046 surrogate = 0;
2047 errmsg = "second surrogate missing";
2048 goto utf7Error;
2049 }
2050 }
2051 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2052 /* first surrogate */
2053 surrogate = outCh;
2054 }
2055 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2056 errmsg = "unexpected second surrogate";
2057 goto utf7Error;
2058 }
2059 else {
2060 *p++ = outCh;
2061 }
2062 }
2063 }
2064 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002065 inShift = 0;
2066 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002067 if (surrogate) {
2068 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002069 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002070 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002071 if (base64bits > 0) { /* left-over bits */
2072 if (base64bits >= 6) {
2073 /* We've seen at least one base-64 character */
2074 errmsg = "partial character in shift sequence";
2075 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002076 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002077 else {
2078 /* Some bits remain; they should be zero */
2079 if (base64buffer != 0) {
2080 errmsg = "non-zero padding bits in shift sequence";
2081 goto utf7Error;
2082 }
2083 }
2084 }
2085 if (ch != '-') {
2086 /* '-' is absorbed; other terminating
2087 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002088 *p++ = ch;
2089 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002090 }
2091 }
2092 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002094 s++; /* consume '+' */
2095 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002096 s++;
2097 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002098 }
2099 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002100 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002101 shiftOutStart = p;
2102 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002103 }
2104 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002105 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002106 *p++ = ch;
2107 s++;
2108 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002109 else {
2110 startinpos = s-starts;
2111 s++;
2112 errmsg = "unexpected special character";
2113 goto utf7Error;
2114 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002115 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002116utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 outpos = p-PyUnicode_AS_UNICODE(unicode);
2118 endinpos = s-starts;
2119 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002120 errors, &errorHandler,
2121 "utf7", errmsg,
2122 &starts, &e, &startinpos, &endinpos, &exc, &s,
2123 &unicode, &outpos, &p))
2124 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002125 }
2126
Antoine Pitrou244651a2009-05-04 18:56:13 +00002127 /* end of string */
2128
2129 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2130 /* if we're in an inconsistent state, that's an error */
2131 if (surrogate ||
2132 (base64bits >= 6) ||
2133 (base64bits > 0 && base64buffer != 0)) {
2134 outpos = p-PyUnicode_AS_UNICODE(unicode);
2135 endinpos = size;
2136 if (unicode_decode_call_errorhandler(
2137 errors, &errorHandler,
2138 "utf7", "unterminated shift sequence",
2139 &starts, &e, &startinpos, &endinpos, &exc, &s,
2140 &unicode, &outpos, &p))
2141 goto onError;
2142 if (s < e)
2143 goto restart;
2144 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002145 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002146
2147 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002148 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002149 if (inShift) {
2150 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002151 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002152 }
2153 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002154 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002155 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002156 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002157
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002158 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002159 goto onError;
2160
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002161 Py_XDECREF(errorHandler);
2162 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002163 return (PyObject *)unicode;
2164
Benjamin Peterson29060642009-01-31 22:14:21 +00002165 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002166 Py_XDECREF(errorHandler);
2167 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002168 Py_DECREF(unicode);
2169 return NULL;
2170}
2171
2172
2173PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002174 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002175 int base64SetO,
2176 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002177 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002178{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002179 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002180 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002181 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002182 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002183 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002184 unsigned int base64bits = 0;
2185 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002186 char * out;
2187 char * start;
2188
2189 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002190 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002191
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002192 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002193 return PyErr_NoMemory();
2194
Antoine Pitrou244651a2009-05-04 18:56:13 +00002195 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002196 if (v == NULL)
2197 return NULL;
2198
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002199 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002200 for (;i < size; ++i) {
2201 Py_UNICODE ch = s[i];
2202
Antoine Pitrou244651a2009-05-04 18:56:13 +00002203 if (inShift) {
2204 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2205 /* shifting out */
2206 if (base64bits) { /* output remaining bits */
2207 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2208 base64buffer = 0;
2209 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002210 }
2211 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002212 /* Characters not in the BASE64 set implicitly unshift the sequence
2213 so no '-' is required, except if the character is itself a '-' */
2214 if (IS_BASE64(ch) || ch == '-') {
2215 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002216 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002217 *out++ = (char) ch;
2218 }
2219 else {
2220 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002221 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002222 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002223 else { /* not in a shift sequence */
2224 if (ch == '+') {
2225 *out++ = '+';
2226 *out++ = '-';
2227 }
2228 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2229 *out++ = (char) ch;
2230 }
2231 else {
2232 *out++ = '+';
2233 inShift = 1;
2234 goto encode_char;
2235 }
2236 }
2237 continue;
2238encode_char:
2239#ifdef Py_UNICODE_WIDE
2240 if (ch >= 0x10000) {
2241 /* code first surrogate */
2242 base64bits += 16;
2243 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2244 while (base64bits >= 6) {
2245 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2246 base64bits -= 6;
2247 }
2248 /* prepare second surrogate */
2249 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2250 }
2251#endif
2252 base64bits += 16;
2253 base64buffer = (base64buffer << 16) | ch;
2254 while (base64bits >= 6) {
2255 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2256 base64bits -= 6;
2257 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002258 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002259 if (base64bits)
2260 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2261 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002262 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002263 if (_PyBytes_Resize(&v, out - start) < 0)
2264 return NULL;
2265 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002266}
2267
Antoine Pitrou244651a2009-05-04 18:56:13 +00002268#undef IS_BASE64
2269#undef FROM_BASE64
2270#undef TO_BASE64
2271#undef DECODE_DIRECT
2272#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002273
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274/* --- UTF-8 Codec -------------------------------------------------------- */
2275
/* Map a UTF-8 lead byte to the total length, in bytes, of the sequence it
   introduces.  Zero means the byte is not a legal lead byte: continuation
   bytes (80-BF), the overlong prefixes C0-C1, and F5-FF.  See RFC 3629 for
   details.  The table is read-only, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2297
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002299 Py_ssize_t size,
2300 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301{
Walter Dörwald69652032004-09-07 20:24:22 +00002302 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2303}
2304
Antoine Pitrouab868312009-01-10 15:40:25 +00002305/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2306#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2307
2308/* Mask to quickly check whether a C 'long' contains a
2309 non-ASCII, UTF8-encoded char. */
2310#if (SIZEOF_LONG == 8)
2311# define ASCII_CHAR_MASK 0x8080808080808080L
2312#elif (SIZEOF_LONG == 4)
2313# define ASCII_CHAR_MASK 0x80808080L
2314#else
2315# error C 'long' size should be either 4 or 8!
2316#endif
2317
Walter Dörwald69652032004-09-07 20:24:22 +00002318PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002319 Py_ssize_t size,
2320 const char *errors,
2321 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002322{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002323 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002325 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002326 Py_ssize_t startinpos;
2327 Py_ssize_t endinpos;
2328 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002329 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 PyUnicodeObject *unicode;
2331 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002332 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002333 PyObject *errorHandler = NULL;
2334 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335
2336 /* Note: size will always be longer than the resulting Unicode
2337 character count */
2338 unicode = _PyUnicode_New(size);
2339 if (!unicode)
2340 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002341 if (size == 0) {
2342 if (consumed)
2343 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346
2347 /* Unpack UTF-8 encoded data */
2348 p = unicode->str;
2349 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002350 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351
2352 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002353 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354
2355 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002356 /* Fast path for runs of ASCII characters. Given that common UTF-8
2357 input will consist of an overwhelming majority of ASCII
2358 characters, we try to optimize for this case by checking
2359 as many characters as a C 'long' can contain.
2360 First, check if we can do an aligned read, as most CPUs have
2361 a penalty for unaligned reads.
2362 */
2363 if (!((size_t) s & LONG_PTR_MASK)) {
2364 /* Help register allocation */
2365 register const char *_s = s;
2366 register Py_UNICODE *_p = p;
2367 while (_s < aligned_end) {
2368 /* Read a whole long at a time (either 4 or 8 bytes),
2369 and do a fast unrolled copy if it only contains ASCII
2370 characters. */
2371 unsigned long data = *(unsigned long *) _s;
2372 if (data & ASCII_CHAR_MASK)
2373 break;
2374 _p[0] = (unsigned char) _s[0];
2375 _p[1] = (unsigned char) _s[1];
2376 _p[2] = (unsigned char) _s[2];
2377 _p[3] = (unsigned char) _s[3];
2378#if (SIZEOF_LONG == 8)
2379 _p[4] = (unsigned char) _s[4];
2380 _p[5] = (unsigned char) _s[5];
2381 _p[6] = (unsigned char) _s[6];
2382 _p[7] = (unsigned char) _s[7];
2383#endif
2384 _s += SIZEOF_LONG;
2385 _p += SIZEOF_LONG;
2386 }
2387 s = _s;
2388 p = _p;
2389 if (s == e)
2390 break;
2391 ch = (unsigned char)*s;
2392 }
2393 }
2394
2395 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002396 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397 s++;
2398 continue;
2399 }
2400
2401 n = utf8_code_length[ch];
2402
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002403 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002404 if (consumed)
2405 break;
2406 else {
2407 errmsg = "unexpected end of data";
2408 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002409 endinpos = startinpos+1;
2410 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2411 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002412 goto utf8Error;
2413 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415
2416 switch (n) {
2417
2418 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002419 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002420 startinpos = s-starts;
2421 endinpos = startinpos+1;
2422 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423
2424 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002425 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002426 startinpos = s-starts;
2427 endinpos = startinpos+1;
2428 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429
2430 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002431 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002432 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002433 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002434 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002435 goto utf8Error;
2436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002438 assert ((ch > 0x007F) && (ch <= 0x07FF));
2439 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002440 break;
2441
2442 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002443 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2444 will result in surrogates in range d800-dfff. Surrogates are
2445 not valid UTF-8 so they are rejected.
2446 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2447 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002448 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002449 (s[2] & 0xc0) != 0x80 ||
2450 ((unsigned char)s[0] == 0xE0 &&
2451 (unsigned char)s[1] < 0xA0) ||
2452 ((unsigned char)s[0] == 0xED &&
2453 (unsigned char)s[1] > 0x9F)) {
2454 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002455 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002456 endinpos = startinpos + 1;
2457
2458 /* if s[1] first two bits are 1 and 0, then the invalid
2459 continuation byte is s[2], so increment endinpos by 1,
2460 if not, s[1] is invalid and endinpos doesn't need to
2461 be incremented. */
2462 if ((s[1] & 0xC0) == 0x80)
2463 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002464 goto utf8Error;
2465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002467 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2468 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002469 break;
2470
2471 case 4:
2472 if ((s[1] & 0xc0) != 0x80 ||
2473 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002474 (s[3] & 0xc0) != 0x80 ||
2475 ((unsigned char)s[0] == 0xF0 &&
2476 (unsigned char)s[1] < 0x90) ||
2477 ((unsigned char)s[0] == 0xF4 &&
2478 (unsigned char)s[1] > 0x8F)) {
2479 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002481 endinpos = startinpos + 1;
2482 if ((s[1] & 0xC0) == 0x80) {
2483 endinpos++;
2484 if ((s[2] & 0xC0) == 0x80)
2485 endinpos++;
2486 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002487 goto utf8Error;
2488 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002489 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002490 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2491 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2492
Fredrik Lundh8f455852001-06-27 18:59:43 +00002493#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002495#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002496 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002497
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002498 /* translate from 10000..10FFFF to 0..FFFF */
2499 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002500
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002501 /* high surrogate = top 10 bits added to D800 */
2502 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002503
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002504 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002505 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002506#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 }
2509 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002510 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002511
Benjamin Peterson29060642009-01-31 22:14:21 +00002512 utf8Error:
2513 outpos = p-PyUnicode_AS_UNICODE(unicode);
2514 if (unicode_decode_call_errorhandler(
2515 errors, &errorHandler,
2516 "utf8", errmsg,
2517 &starts, &e, &startinpos, &endinpos, &exc, &s,
2518 &unicode, &outpos, &p))
2519 goto onError;
2520 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 }
Walter Dörwald69652032004-09-07 20:24:22 +00002522 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002523 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002524
2525 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002526 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 goto onError;
2528
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002529 Py_XDECREF(errorHandler);
2530 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 return (PyObject *)unicode;
2532
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534 Py_XDECREF(errorHandler);
2535 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 Py_DECREF(unicode);
2537 return NULL;
2538}
2539
Antoine Pitrouab868312009-01-10 15:40:25 +00002540#undef ASCII_CHAR_MASK
2541
2542
Tim Peters602f7402002-04-27 18:03:26 +00002543/* Allocation strategy: if the string is short, convert into a stack buffer
2544 and allocate exactly as much space needed at the end. Else allocate the
2545 maximum possible needed (4 result bytes per Unicode character), and return
2546 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002547*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002548PyObject *
2549PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002550 Py_ssize_t size,
2551 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552{
Tim Peters602f7402002-04-27 18:03:26 +00002553#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002554
Guido van Rossum98297ee2007-11-06 21:34:58 +00002555 Py_ssize_t i; /* index into s of next input byte */
2556 PyObject *result; /* result string object */
2557 char *p; /* next free byte in output buffer */
2558 Py_ssize_t nallocated; /* number of result bytes allocated */
2559 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002560 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002561 PyObject *errorHandler = NULL;
2562 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002563
Tim Peters602f7402002-04-27 18:03:26 +00002564 assert(s != NULL);
2565 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566
Tim Peters602f7402002-04-27 18:03:26 +00002567 if (size <= MAX_SHORT_UNICHARS) {
2568 /* Write into the stack buffer; nallocated can't overflow.
2569 * At the end, we'll allocate exactly as much heap space as it
2570 * turns out we need.
2571 */
2572 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002573 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002574 p = stackbuf;
2575 }
2576 else {
2577 /* Overallocate on the heap, and give the excess back at the end. */
2578 nallocated = size * 4;
2579 if (nallocated / 4 != size) /* overflow! */
2580 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002581 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002582 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002583 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002584 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002585 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002586
Tim Peters602f7402002-04-27 18:03:26 +00002587 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002588 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002589
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002590 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002591 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002593
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002595 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002596 *p++ = (char)(0xc0 | (ch >> 6));
2597 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002598 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002599#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002600 /* Special case: check for high and low surrogate */
2601 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2602 Py_UCS4 ch2 = s[i];
2603 /* Combine the two surrogates to form a UCS4 value */
2604 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2605 i++;
2606
2607 /* Encode UCS4 Unicode ordinals */
2608 *p++ = (char)(0xf0 | (ch >> 18));
2609 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002610 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2611 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002612 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002613#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002614 Py_ssize_t newpos;
2615 PyObject *rep;
2616 Py_ssize_t repsize, k;
2617 rep = unicode_encode_call_errorhandler
2618 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2619 s, size, &exc, i-1, i, &newpos);
2620 if (!rep)
2621 goto error;
2622
2623 if (PyBytes_Check(rep))
2624 repsize = PyBytes_GET_SIZE(rep);
2625 else
2626 repsize = PyUnicode_GET_SIZE(rep);
2627
2628 if (repsize > 4) {
2629 Py_ssize_t offset;
2630
2631 if (result == NULL)
2632 offset = p - stackbuf;
2633 else
2634 offset = p - PyBytes_AS_STRING(result);
2635
2636 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2637 /* integer overflow */
2638 PyErr_NoMemory();
2639 goto error;
2640 }
2641 nallocated += repsize - 4;
2642 if (result != NULL) {
2643 if (_PyBytes_Resize(&result, nallocated) < 0)
2644 goto error;
2645 } else {
2646 result = PyBytes_FromStringAndSize(NULL, nallocated);
2647 if (result == NULL)
2648 goto error;
2649 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2650 }
2651 p = PyBytes_AS_STRING(result) + offset;
2652 }
2653
2654 if (PyBytes_Check(rep)) {
2655 char *prep = PyBytes_AS_STRING(rep);
2656 for(k = repsize; k > 0; k--)
2657 *p++ = *prep++;
2658 } else /* rep is unicode */ {
2659 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2660 Py_UNICODE c;
2661
2662 for(k=0; k<repsize; k++) {
2663 c = prep[k];
2664 if (0x80 <= c) {
2665 raise_encode_exception(&exc, "utf-8", s, size,
2666 i-1, i, "surrogates not allowed");
2667 goto error;
2668 }
2669 *p++ = (char)prep[k];
2670 }
2671 }
2672 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002673#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002674 }
Victor Stinner445a6232010-04-22 20:01:57 +00002675#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002676 } else if (ch < 0x10000) {
2677 *p++ = (char)(0xe0 | (ch >> 12));
2678 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2679 *p++ = (char)(0x80 | (ch & 0x3f));
2680 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002681 /* Encode UCS4 Unicode ordinals */
2682 *p++ = (char)(0xf0 | (ch >> 18));
2683 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2684 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2685 *p++ = (char)(0x80 | (ch & 0x3f));
2686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002688
Guido van Rossum98297ee2007-11-06 21:34:58 +00002689 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002690 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002691 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002692 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002693 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002694 }
2695 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002696 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002697 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002698 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002699 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002700 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002701 Py_XDECREF(errorHandler);
2702 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002703 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002704 error:
2705 Py_XDECREF(errorHandler);
2706 Py_XDECREF(exc);
2707 Py_XDECREF(result);
2708 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002709
Tim Peters602f7402002-04-27 18:03:26 +00002710#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711}
2712
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2714{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715 if (!PyUnicode_Check(unicode)) {
2716 PyErr_BadArgument();
2717 return NULL;
2718 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002719 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002720 PyUnicode_GET_SIZE(unicode),
2721 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722}
2723
Walter Dörwald41980ca2007-08-16 21:55:45 +00002724/* --- UTF-32 Codec ------------------------------------------------------- */
2725
2726PyObject *
2727PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002728 Py_ssize_t size,
2729 const char *errors,
2730 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002731{
2732 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2733}
2734
2735PyObject *
2736PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 Py_ssize_t size,
2738 const char *errors,
2739 int *byteorder,
2740 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002741{
2742 const char *starts = s;
2743 Py_ssize_t startinpos;
2744 Py_ssize_t endinpos;
2745 Py_ssize_t outpos;
2746 PyUnicodeObject *unicode;
2747 Py_UNICODE *p;
2748#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002749 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002750 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002751#else
2752 const int pairs = 0;
2753#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002754 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002755 int bo = 0; /* assume native ordering by default */
2756 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002757 /* Offsets from q for retrieving bytes in the right order. */
2758#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2759 int iorder[] = {0, 1, 2, 3};
2760#else
2761 int iorder[] = {3, 2, 1, 0};
2762#endif
2763 PyObject *errorHandler = NULL;
2764 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002765
Walter Dörwald41980ca2007-08-16 21:55:45 +00002766 q = (unsigned char *)s;
2767 e = q + size;
2768
2769 if (byteorder)
2770 bo = *byteorder;
2771
2772 /* Check for BOM marks (U+FEFF) in the input and adjust current
2773 byte order setting accordingly. In native mode, the leading BOM
2774 mark is skipped, in all other modes, it is copied to the output
2775 stream as-is (giving a ZWNBSP character). */
2776 if (bo == 0) {
2777 if (size >= 4) {
2778 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002779 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002780#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002781 if (bom == 0x0000FEFF) {
2782 q += 4;
2783 bo = -1;
2784 }
2785 else if (bom == 0xFFFE0000) {
2786 q += 4;
2787 bo = 1;
2788 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002789#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002790 if (bom == 0x0000FEFF) {
2791 q += 4;
2792 bo = 1;
2793 }
2794 else if (bom == 0xFFFE0000) {
2795 q += 4;
2796 bo = -1;
2797 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002798#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002799 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002800 }
2801
2802 if (bo == -1) {
2803 /* force LE */
2804 iorder[0] = 0;
2805 iorder[1] = 1;
2806 iorder[2] = 2;
2807 iorder[3] = 3;
2808 }
2809 else if (bo == 1) {
2810 /* force BE */
2811 iorder[0] = 3;
2812 iorder[1] = 2;
2813 iorder[2] = 1;
2814 iorder[3] = 0;
2815 }
2816
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002817 /* On narrow builds we split characters outside the BMP into two
2818 codepoints => count how much extra space we need. */
2819#ifndef Py_UNICODE_WIDE
2820 for (qq = q; qq < e; qq += 4)
2821 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2822 pairs++;
2823#endif
2824
2825 /* This might be one to much, because of a BOM */
2826 unicode = _PyUnicode_New((size+3)/4+pairs);
2827 if (!unicode)
2828 return NULL;
2829 if (size == 0)
2830 return (PyObject *)unicode;
2831
2832 /* Unpack UTF-32 encoded data */
2833 p = unicode->str;
2834
Walter Dörwald41980ca2007-08-16 21:55:45 +00002835 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002836 Py_UCS4 ch;
2837 /* remaining bytes at the end? (size should be divisible by 4) */
2838 if (e-q<4) {
2839 if (consumed)
2840 break;
2841 errmsg = "truncated data";
2842 startinpos = ((const char *)q)-starts;
2843 endinpos = ((const char *)e)-starts;
2844 goto utf32Error;
2845 /* The remaining input chars are ignored if the callback
2846 chooses to skip the input */
2847 }
2848 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2849 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002850
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 if (ch >= 0x110000)
2852 {
2853 errmsg = "codepoint not in range(0x110000)";
2854 startinpos = ((const char *)q)-starts;
2855 endinpos = startinpos+4;
2856 goto utf32Error;
2857 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002858#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 if (ch >= 0x10000)
2860 {
2861 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2862 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2863 }
2864 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002865#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002866 *p++ = ch;
2867 q += 4;
2868 continue;
2869 utf32Error:
2870 outpos = p-PyUnicode_AS_UNICODE(unicode);
2871 if (unicode_decode_call_errorhandler(
2872 errors, &errorHandler,
2873 "utf32", errmsg,
2874 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2875 &unicode, &outpos, &p))
2876 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002877 }
2878
2879 if (byteorder)
2880 *byteorder = bo;
2881
2882 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002883 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002884
2885 /* Adjust length */
2886 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2887 goto onError;
2888
2889 Py_XDECREF(errorHandler);
2890 Py_XDECREF(exc);
2891 return (PyObject *)unicode;
2892
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002894 Py_DECREF(unicode);
2895 Py_XDECREF(errorHandler);
2896 Py_XDECREF(exc);
2897 return NULL;
2898}
2899
2900PyObject *
2901PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002902 Py_ssize_t size,
2903 const char *errors,
2904 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002905{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002906 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002907 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002908 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002909#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002910 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002911#else
2912 const int pairs = 0;
2913#endif
2914 /* Offsets from p for storing byte pairs in the right order. */
2915#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2916 int iorder[] = {0, 1, 2, 3};
2917#else
2918 int iorder[] = {3, 2, 1, 0};
2919#endif
2920
Benjamin Peterson29060642009-01-31 22:14:21 +00002921#define STORECHAR(CH) \
2922 do { \
2923 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2924 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2925 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2926 p[iorder[0]] = (CH) & 0xff; \
2927 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002928 } while(0)
2929
2930 /* In narrow builds we can output surrogate pairs as one codepoint,
2931 so we need less space. */
2932#ifndef Py_UNICODE_WIDE
2933 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2935 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2936 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002937#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002938 nsize = (size - pairs + (byteorder == 0));
2939 bytesize = nsize * 4;
2940 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002942 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002943 if (v == NULL)
2944 return NULL;
2945
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002946 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002947 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002948 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002949 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002950 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002951
2952 if (byteorder == -1) {
2953 /* force LE */
2954 iorder[0] = 0;
2955 iorder[1] = 1;
2956 iorder[2] = 2;
2957 iorder[3] = 3;
2958 }
2959 else if (byteorder == 1) {
2960 /* force BE */
2961 iorder[0] = 3;
2962 iorder[1] = 2;
2963 iorder[2] = 1;
2964 iorder[3] = 0;
2965 }
2966
2967 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002968 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002969#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002970 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2971 Py_UCS4 ch2 = *s;
2972 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2973 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2974 s++;
2975 size--;
2976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002977 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002978#endif
2979 STORECHAR(ch);
2980 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002981
2982 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002983 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002984#undef STORECHAR
2985}
2986
2987PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2988{
2989 if (!PyUnicode_Check(unicode)) {
2990 PyErr_BadArgument();
2991 return NULL;
2992 }
2993 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002994 PyUnicode_GET_SIZE(unicode),
2995 NULL,
2996 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002997}
2998
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999/* --- UTF-16 Codec ------------------------------------------------------- */
3000
Tim Peters772747b2001-08-09 22:21:55 +00003001PyObject *
3002PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 Py_ssize_t size,
3004 const char *errors,
3005 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006{
Walter Dörwald69652032004-09-07 20:24:22 +00003007 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3008}
3009
Antoine Pitrouab868312009-01-10 15:40:25 +00003010/* Two masks for fast checking of whether a C 'long' may contain
3011 UTF16-encoded surrogate characters. This is an efficient heuristic,
3012 assuming that non-surrogate characters with a code point >= 0x8000 are
3013 rare in most input.
3014 FAST_CHAR_MASK is used when the input is in native byte ordering,
3015 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003016*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003017#if (SIZEOF_LONG == 8)
3018# define FAST_CHAR_MASK 0x8000800080008000L
3019# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3020#elif (SIZEOF_LONG == 4)
3021# define FAST_CHAR_MASK 0x80008000L
3022# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3023#else
3024# error C 'long' size should be either 4 or 8!
3025#endif
3026
Walter Dörwald69652032004-09-07 20:24:22 +00003027PyObject *
3028PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 Py_ssize_t size,
3030 const char *errors,
3031 int *byteorder,
3032 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003033{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003034 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003035 Py_ssize_t startinpos;
3036 Py_ssize_t endinpos;
3037 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 PyUnicodeObject *unicode;
3039 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003040 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003041 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003042 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003043 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003044 /* Offsets from q for retrieving byte pairs in the right order. */
3045#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3046 int ihi = 1, ilo = 0;
3047#else
3048 int ihi = 0, ilo = 1;
3049#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 PyObject *errorHandler = NULL;
3051 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052
3053 /* Note: size will always be longer than the resulting Unicode
3054 character count */
3055 unicode = _PyUnicode_New(size);
3056 if (!unicode)
3057 return NULL;
3058 if (size == 0)
3059 return (PyObject *)unicode;
3060
3061 /* Unpack UTF-16 encoded data */
3062 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003063 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003064 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065
3066 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003067 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003069 /* Check for BOM marks (U+FEFF) in the input and adjust current
3070 byte order setting accordingly. In native mode, the leading BOM
3071 mark is skipped, in all other modes, it is copied to the output
3072 stream as-is (giving a ZWNBSP character). */
3073 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003074 if (size >= 2) {
3075 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003076#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 if (bom == 0xFEFF) {
3078 q += 2;
3079 bo = -1;
3080 }
3081 else if (bom == 0xFFFE) {
3082 q += 2;
3083 bo = 1;
3084 }
Tim Petersced69f82003-09-16 20:30:58 +00003085#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 if (bom == 0xFEFF) {
3087 q += 2;
3088 bo = 1;
3089 }
3090 else if (bom == 0xFFFE) {
3091 q += 2;
3092 bo = -1;
3093 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003094#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003095 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097
Tim Peters772747b2001-08-09 22:21:55 +00003098 if (bo == -1) {
3099 /* force LE */
3100 ihi = 1;
3101 ilo = 0;
3102 }
3103 else if (bo == 1) {
3104 /* force BE */
3105 ihi = 0;
3106 ilo = 1;
3107 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003108#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3109 native_ordering = ilo < ihi;
3110#else
3111 native_ordering = ilo > ihi;
3112#endif
Tim Peters772747b2001-08-09 22:21:55 +00003113
Antoine Pitrouab868312009-01-10 15:40:25 +00003114 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003115 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003116 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003117 /* First check for possible aligned read of a C 'long'. Unaligned
3118 reads are more expensive, better to defer to another iteration. */
3119 if (!((size_t) q & LONG_PTR_MASK)) {
3120 /* Fast path for runs of non-surrogate chars. */
3121 register const unsigned char *_q = q;
3122 Py_UNICODE *_p = p;
3123 if (native_ordering) {
3124 /* Native ordering is simple: as long as the input cannot
3125 possibly contain a surrogate char, do an unrolled copy
3126 of several 16-bit code points to the target object.
3127 The non-surrogate check is done on several input bytes
3128 at a time (as many as a C 'long' can contain). */
3129 while (_q < aligned_end) {
3130 unsigned long data = * (unsigned long *) _q;
3131 if (data & FAST_CHAR_MASK)
3132 break;
3133 _p[0] = ((unsigned short *) _q)[0];
3134 _p[1] = ((unsigned short *) _q)[1];
3135#if (SIZEOF_LONG == 8)
3136 _p[2] = ((unsigned short *) _q)[2];
3137 _p[3] = ((unsigned short *) _q)[3];
3138#endif
3139 _q += SIZEOF_LONG;
3140 _p += SIZEOF_LONG / 2;
3141 }
3142 }
3143 else {
3144 /* Byteswapped ordering is similar, but we must decompose
3145 the copy bytewise, and take care of zero'ing out the
3146 upper bytes if the target object is in 32-bit units
3147 (that is, in UCS-4 builds). */
3148 while (_q < aligned_end) {
3149 unsigned long data = * (unsigned long *) _q;
3150 if (data & SWAPPED_FAST_CHAR_MASK)
3151 break;
3152 /* Zero upper bytes in UCS-4 builds */
3153#if (Py_UNICODE_SIZE > 2)
3154 _p[0] = 0;
3155 _p[1] = 0;
3156#if (SIZEOF_LONG == 8)
3157 _p[2] = 0;
3158 _p[3] = 0;
3159#endif
3160#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003161 /* Issue #4916; UCS-4 builds on big endian machines must
3162 fill the two last bytes of each 4-byte unit. */
3163#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3164# define OFF 2
3165#else
3166# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003167#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003168 ((unsigned char *) _p)[OFF + 1] = _q[0];
3169 ((unsigned char *) _p)[OFF + 0] = _q[1];
3170 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3171 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3172#if (SIZEOF_LONG == 8)
3173 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3174 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3175 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3176 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3177#endif
3178#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003179 _q += SIZEOF_LONG;
3180 _p += SIZEOF_LONG / 2;
3181 }
3182 }
3183 p = _p;
3184 q = _q;
3185 if (q >= e)
3186 break;
3187 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003188 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003189
Benjamin Peterson14339b62009-01-31 16:36:08 +00003190 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003191
3192 if (ch < 0xD800 || ch > 0xDFFF) {
3193 *p++ = ch;
3194 continue;
3195 }
3196
3197 /* UTF-16 code pair: */
3198 if (q > e) {
3199 errmsg = "unexpected end of data";
3200 startinpos = (((const char *)q) - 2) - starts;
3201 endinpos = ((const char *)e) + 1 - starts;
3202 goto utf16Error;
3203 }
3204 if (0xD800 <= ch && ch <= 0xDBFF) {
3205 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3206 q += 2;
3207 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003208#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003209 *p++ = ch;
3210 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003211#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003212 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003213#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003214 continue;
3215 }
3216 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003217 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 startinpos = (((const char *)q)-4)-starts;
3219 endinpos = startinpos+2;
3220 goto utf16Error;
3221 }
3222
Benjamin Peterson14339b62009-01-31 16:36:08 +00003223 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 errmsg = "illegal encoding";
3225 startinpos = (((const char *)q)-2)-starts;
3226 endinpos = startinpos+2;
3227 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003228
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 utf16Error:
3230 outpos = p - PyUnicode_AS_UNICODE(unicode);
3231 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003232 errors,
3233 &errorHandler,
3234 "utf16", errmsg,
3235 &starts,
3236 (const char **)&e,
3237 &startinpos,
3238 &endinpos,
3239 &exc,
3240 (const char **)&q,
3241 &unicode,
3242 &outpos,
3243 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003246 /* remaining byte at the end? (size should be even) */
3247 if (e == q) {
3248 if (!consumed) {
3249 errmsg = "truncated data";
3250 startinpos = ((const char *)q) - starts;
3251 endinpos = ((const char *)e) + 1 - starts;
3252 outpos = p - PyUnicode_AS_UNICODE(unicode);
3253 if (unicode_decode_call_errorhandler(
3254 errors,
3255 &errorHandler,
3256 "utf16", errmsg,
3257 &starts,
3258 (const char **)&e,
3259 &startinpos,
3260 &endinpos,
3261 &exc,
3262 (const char **)&q,
3263 &unicode,
3264 &outpos,
3265 &p))
3266 goto onError;
3267 /* The remaining input chars are ignored if the callback
3268 chooses to skip the input */
3269 }
3270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271
3272 if (byteorder)
3273 *byteorder = bo;
3274
Walter Dörwald69652032004-09-07 20:24:22 +00003275 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003276 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003277
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003279 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 goto onError;
3281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282 Py_XDECREF(errorHandler);
3283 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 return (PyObject *)unicode;
3285
Benjamin Peterson29060642009-01-31 22:14:21 +00003286 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 Py_XDECREF(errorHandler);
3289 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 return NULL;
3291}
3292
Antoine Pitrouab868312009-01-10 15:40:25 +00003293#undef FAST_CHAR_MASK
3294#undef SWAPPED_FAST_CHAR_MASK
3295
Tim Peters772747b2001-08-09 22:21:55 +00003296PyObject *
3297PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 Py_ssize_t size,
3299 const char *errors,
3300 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003302 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003303 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003304 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003305#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003306 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003307#else
3308 const int pairs = 0;
3309#endif
Tim Peters772747b2001-08-09 22:21:55 +00003310 /* Offsets from p for storing byte pairs in the right order. */
3311#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3312 int ihi = 1, ilo = 0;
3313#else
3314 int ihi = 0, ilo = 1;
3315#endif
3316
Benjamin Peterson29060642009-01-31 22:14:21 +00003317#define STORECHAR(CH) \
3318 do { \
3319 p[ihi] = ((CH) >> 8) & 0xff; \
3320 p[ilo] = (CH) & 0xff; \
3321 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003322 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003324#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003325 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 if (s[i] >= 0x10000)
3327 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003328#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003329 /* 2 * (size + pairs + (byteorder == 0)) */
3330 if (size > PY_SSIZE_T_MAX ||
3331 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003333 nsize = size + pairs + (byteorder == 0);
3334 bytesize = nsize * 2;
3335 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003337 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 if (v == NULL)
3339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003341 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003343 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003344 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003345 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003346
3347 if (byteorder == -1) {
3348 /* force LE */
3349 ihi = 1;
3350 ilo = 0;
3351 }
3352 else if (byteorder == 1) {
3353 /* force BE */
3354 ihi = 0;
3355 ilo = 1;
3356 }
3357
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003358 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003359 Py_UNICODE ch = *s++;
3360 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003361#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003362 if (ch >= 0x10000) {
3363 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3364 ch = 0xD800 | ((ch-0x10000) >> 10);
3365 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003366#endif
Tim Peters772747b2001-08-09 22:21:55 +00003367 STORECHAR(ch);
3368 if (ch2)
3369 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003370 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003371
3372 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003373 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003374#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375}
3376
3377PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3378{
3379 if (!PyUnicode_Check(unicode)) {
3380 PyErr_BadArgument();
3381 return NULL;
3382 }
3383 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003384 PyUnicode_GET_SIZE(unicode),
3385 NULL,
3386 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003387}
3388
3389/* --- Unicode Escape Codec ----------------------------------------------- */
3390
Fredrik Lundh06d12682001-01-24 07:59:11 +00003391static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003392
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 Py_ssize_t size,
3395 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003397 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003398 Py_ssize_t startinpos;
3399 Py_ssize_t endinpos;
3400 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003405 char* message;
3406 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 PyObject *errorHandler = NULL;
3408 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003409
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 /* Escaped strings will always be longer than the resulting
3411 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 length after conversion to the true value.
3413 (but if the error callback returns a long replacement string
3414 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415 v = _PyUnicode_New(size);
3416 if (v == NULL)
3417 goto onError;
3418 if (size == 0)
3419 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003423
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424 while (s < end) {
3425 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003426 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428
3429 /* Non-escape characters are interpreted as Unicode ordinals */
3430 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003431 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 continue;
3433 }
3434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003436 /* \ - Escapes */
3437 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003438 c = *s++;
3439 if (s > end)
3440 c = '\0'; /* Invalid after \ */
3441 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 case '\n': break;
3445 case '\\': *p++ = '\\'; break;
3446 case '\'': *p++ = '\''; break;
3447 case '\"': *p++ = '\"'; break;
3448 case 'b': *p++ = '\b'; break;
3449 case 'f': *p++ = '\014'; break; /* FF */
3450 case 't': *p++ = '\t'; break;
3451 case 'n': *p++ = '\n'; break;
3452 case 'r': *p++ = '\r'; break;
3453 case 'v': *p++ = '\013'; break; /* VT */
3454 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3455
Benjamin Peterson29060642009-01-31 22:14:21 +00003456 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 case '0': case '1': case '2': case '3':
3458 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003459 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003460 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003461 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003462 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003463 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003465 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 break;
3467
Benjamin Peterson29060642009-01-31 22:14:21 +00003468 /* hex escapes */
3469 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003471 digits = 2;
3472 message = "truncated \\xXX escape";
3473 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474
Benjamin Peterson29060642009-01-31 22:14:21 +00003475 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003477 digits = 4;
3478 message = "truncated \\uXXXX escape";
3479 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003482 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003483 digits = 8;
3484 message = "truncated \\UXXXXXXXX escape";
3485 hexescape:
3486 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 outpos = p-PyUnicode_AS_UNICODE(v);
3488 if (s+digits>end) {
3489 endinpos = size;
3490 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003491 errors, &errorHandler,
3492 "unicodeescape", "end of string in escape sequence",
3493 &starts, &end, &startinpos, &endinpos, &exc, &s,
3494 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 goto onError;
3496 goto nextByte;
3497 }
3498 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003499 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003500 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 endinpos = (s+i+1)-starts;
3502 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003503 errors, &errorHandler,
3504 "unicodeescape", message,
3505 &starts, &end, &startinpos, &endinpos, &exc, &s,
3506 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003507 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003509 }
3510 chr = (chr<<4) & ~0xF;
3511 if (c >= '0' && c <= '9')
3512 chr += c - '0';
3513 else if (c >= 'a' && c <= 'f')
3514 chr += 10 + c - 'a';
3515 else
3516 chr += 10 + c - 'A';
3517 }
3518 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003519 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 /* _decoding_error will have already written into the
3521 target buffer. */
3522 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003523 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003524 /* when we get here, chr is a 32-bit unicode character */
3525 if (chr <= 0xffff)
3526 /* UCS-2 character */
3527 *p++ = (Py_UNICODE) chr;
3528 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003529 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003530 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003531#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003532 *p++ = chr;
3533#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003534 chr -= 0x10000L;
3535 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003536 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003537#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003538 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 endinpos = s-starts;
3540 outpos = p-PyUnicode_AS_UNICODE(v);
3541 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003542 errors, &errorHandler,
3543 "unicodeescape", "illegal Unicode character",
3544 &starts, &end, &startinpos, &endinpos, &exc, &s,
3545 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003546 goto onError;
3547 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003548 break;
3549
Benjamin Peterson29060642009-01-31 22:14:21 +00003550 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003551 case 'N':
3552 message = "malformed \\N character escape";
3553 if (ucnhash_CAPI == NULL) {
3554 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003555 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003556 if (ucnhash_CAPI == NULL)
3557 goto ucnhashError;
3558 }
3559 if (*s == '{') {
3560 const char *start = s+1;
3561 /* look for the closing brace */
3562 while (*s != '}' && s < end)
3563 s++;
3564 if (s > start && s < end && *s == '}') {
3565 /* found a name. look it up in the unicode database */
3566 message = "unknown Unicode character name";
3567 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003568 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003569 goto store;
3570 }
3571 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 endinpos = s-starts;
3573 outpos = p-PyUnicode_AS_UNICODE(v);
3574 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003575 errors, &errorHandler,
3576 "unicodeescape", message,
3577 &starts, &end, &startinpos, &endinpos, &exc, &s,
3578 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003579 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003580 break;
3581
3582 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003583 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 message = "\\ at end of string";
3585 s--;
3586 endinpos = s-starts;
3587 outpos = p-PyUnicode_AS_UNICODE(v);
3588 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003589 errors, &errorHandler,
3590 "unicodeescape", message,
3591 &starts, &end, &startinpos, &endinpos, &exc, &s,
3592 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003593 goto onError;
3594 }
3595 else {
3596 *p++ = '\\';
3597 *p++ = (unsigned char)s[-1];
3598 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003599 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003604 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003606 Py_XDECREF(errorHandler);
3607 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003609
Benjamin Peterson29060642009-01-31 22:14:21 +00003610 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003611 PyErr_SetString(
3612 PyExc_UnicodeError,
3613 "\\N escapes not supported (can't load unicodedata module)"
3614 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003615 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 Py_XDECREF(errorHandler);
3617 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003618 return NULL;
3619
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 Py_XDECREF(errorHandler);
3623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 return NULL;
3625}
3626
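/* Return a Unicode-Escape string version of the Unicode object.

   Historical note: this comment originally described a "quotes" flag
   ("If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate").  PyUnicode_EncodeUnicodeEscape(), defined further below,
   takes only (s, size) and has no such parameter.

*/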
3633
Thomas Wouters477c8d52006-05-27 19:21:47 +00003634Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 Py_ssize_t size,
3636 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003637{
3638 /* like wcschr, but doesn't stop at NULL characters */
3639
3640 while (size-- > 0) {
3641 if (*s == ch)
3642 return s;
3643 s++;
3644 }
3645
3646 return NULL;
3647}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003648
Walter Dörwald79e913e2007-05-12 11:08:06 +00003649static const char *hexdigits = "0123456789abcdef";
3650
3651PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003652 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003654 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003657#ifdef Py_UNICODE_WIDE
3658 const Py_ssize_t expandsize = 10;
3659#else
3660 const Py_ssize_t expandsize = 6;
3661#endif
3662
Thomas Wouters89f507f2006-12-13 04:49:30 +00003663 /* XXX(nnorwitz): rather than over-allocating, it would be
3664 better to choose a different scheme. Perhaps scan the
3665 first N-chars of the string and allocate based on that size.
3666 */
3667 /* Initial allocation is based on the longest-possible unichr
3668 escape.
3669
3670 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3671 unichr, so in this case it's the longest unichr escape. In
3672 narrow (UTF-16) builds this is five chars per source unichr
3673 since there are two unichrs in the surrogate pair, so in narrow
3674 (UTF-16) builds it's not the longest unichr escape.
3675
3676 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3677 so in the narrow (UTF-16) build case it's the longest unichr
3678 escape.
3679 */
3680
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003681 if (size == 0)
3682 return PyBytes_FromStringAndSize(NULL, 0);
3683
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003684 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003686
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003687 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 2
3689 + expandsize*size
3690 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 if (repr == NULL)
3692 return NULL;
3693
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003694 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 while (size-- > 0) {
3697 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003698
Walter Dörwald79e913e2007-05-12 11:08:06 +00003699 /* Escape backslashes */
3700 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 *p++ = '\\';
3702 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003703 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003704 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003705
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003706#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003707 /* Map 21-bit characters to '\U00xxxxxx' */
3708 else if (ch >= 0x10000) {
3709 *p++ = '\\';
3710 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003711 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3712 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3713 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3714 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3715 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3716 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3717 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3718 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003719 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003720 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003721#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003722 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3723 else if (ch >= 0xD800 && ch < 0xDC00) {
3724 Py_UNICODE ch2;
3725 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003726
Benjamin Peterson29060642009-01-31 22:14:21 +00003727 ch2 = *s++;
3728 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003729 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003730 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3731 *p++ = '\\';
3732 *p++ = 'U';
3733 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3734 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3735 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3736 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3737 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3738 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3739 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3740 *p++ = hexdigits[ucs & 0x0000000F];
3741 continue;
3742 }
3743 /* Fall through: isolated surrogates are copied as-is */
3744 s--;
3745 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003746 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003747#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003748
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003750 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 *p++ = '\\';
3752 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003753 *p++ = hexdigits[(ch >> 12) & 0x000F];
3754 *p++ = hexdigits[(ch >> 8) & 0x000F];
3755 *p++ = hexdigits[(ch >> 4) & 0x000F];
3756 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003758
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003759 /* Map special whitespace to '\t', \n', '\r' */
3760 else if (ch == '\t') {
3761 *p++ = '\\';
3762 *p++ = 't';
3763 }
3764 else if (ch == '\n') {
3765 *p++ = '\\';
3766 *p++ = 'n';
3767 }
3768 else if (ch == '\r') {
3769 *p++ = '\\';
3770 *p++ = 'r';
3771 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003772
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003773 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003774 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003776 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003777 *p++ = hexdigits[(ch >> 4) & 0x000F];
3778 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003779 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003780
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 /* Copy everything else as-is */
3782 else
3783 *p++ = (char) ch;
3784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003786 assert(p - PyBytes_AS_STRING(repr) > 0);
3787 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3788 return NULL;
3789 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790}
3791
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003792PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003794 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 if (!PyUnicode_Check(unicode)) {
3796 PyErr_BadArgument();
3797 return NULL;
3798 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003799 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3800 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003801 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802}
3803
3804/* --- Raw Unicode Escape Codec ------------------------------------------- */
3805
3806PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003807 Py_ssize_t size,
3808 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003811 Py_ssize_t startinpos;
3812 Py_ssize_t endinpos;
3813 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 const char *end;
3817 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 PyObject *errorHandler = NULL;
3819 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003820
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 /* Escaped strings will always be longer than the resulting
3822 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 length after conversion to the true value. (But decoding error
3824 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 v = _PyUnicode_New(size);
3826 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003827 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003829 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003830 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 end = s + size;
3832 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003833 unsigned char c;
3834 Py_UCS4 x;
3835 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003836 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837
Benjamin Peterson29060642009-01-31 22:14:21 +00003838 /* Non-escape characters are interpreted as Unicode ordinals */
3839 if (*s != '\\') {
3840 *p++ = (unsigned char)*s++;
3841 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003842 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 startinpos = s-starts;
3844
3845 /* \u-escapes are only interpreted iff the number of leading
3846 backslashes if odd */
3847 bs = s;
3848 for (;s < end;) {
3849 if (*s != '\\')
3850 break;
3851 *p++ = (unsigned char)*s++;
3852 }
3853 if (((s - bs) & 1) == 0 ||
3854 s >= end ||
3855 (*s != 'u' && *s != 'U')) {
3856 continue;
3857 }
3858 p--;
3859 count = *s=='u' ? 4 : 8;
3860 s++;
3861
3862 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3863 outpos = p-PyUnicode_AS_UNICODE(v);
3864 for (x = 0, i = 0; i < count; ++i, ++s) {
3865 c = (unsigned char)*s;
3866 if (!ISXDIGIT(c)) {
3867 endinpos = s-starts;
3868 if (unicode_decode_call_errorhandler(
3869 errors, &errorHandler,
3870 "rawunicodeescape", "truncated \\uXXXX",
3871 &starts, &end, &startinpos, &endinpos, &exc, &s,
3872 &v, &outpos, &p))
3873 goto onError;
3874 goto nextByte;
3875 }
3876 x = (x<<4) & ~0xF;
3877 if (c >= '0' && c <= '9')
3878 x += c - '0';
3879 else if (c >= 'a' && c <= 'f')
3880 x += 10 + c - 'a';
3881 else
3882 x += 10 + c - 'A';
3883 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003884 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003885 /* UCS-2 character */
3886 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003887 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003888 /* UCS-4 character. Either store directly, or as
3889 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003890#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003891 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003892#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003893 x -= 0x10000L;
3894 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3895 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003896#endif
3897 } else {
3898 endinpos = s-starts;
3899 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003900 if (unicode_decode_call_errorhandler(
3901 errors, &errorHandler,
3902 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003903 &starts, &end, &startinpos, &endinpos, &exc, &s,
3904 &v, &outpos, &p))
3905 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003906 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003907 nextByte:
3908 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003910 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003911 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 Py_XDECREF(errorHandler);
3913 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003915
Benjamin Peterson29060642009-01-31 22:14:21 +00003916 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003918 Py_XDECREF(errorHandler);
3919 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 return NULL;
3921}
3922
3923PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003926 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 char *p;
3928 char *q;
3929
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003930#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003931 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003932#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003933 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003934#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003935
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003936 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003937 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003938
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003939 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 if (repr == NULL)
3941 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003942 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003943 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003945 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946 while (size-- > 0) {
3947 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003948#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003949 /* Map 32-bit characters to '\Uxxxxxxxx' */
3950 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003951 *p++ = '\\';
3952 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003953 *p++ = hexdigits[(ch >> 28) & 0xf];
3954 *p++ = hexdigits[(ch >> 24) & 0xf];
3955 *p++ = hexdigits[(ch >> 20) & 0xf];
3956 *p++ = hexdigits[(ch >> 16) & 0xf];
3957 *p++ = hexdigits[(ch >> 12) & 0xf];
3958 *p++ = hexdigits[(ch >> 8) & 0xf];
3959 *p++ = hexdigits[(ch >> 4) & 0xf];
3960 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003961 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003962 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003963#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003964 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3965 if (ch >= 0xD800 && ch < 0xDC00) {
3966 Py_UNICODE ch2;
3967 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003968
Benjamin Peterson29060642009-01-31 22:14:21 +00003969 ch2 = *s++;
3970 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003971 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003972 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3973 *p++ = '\\';
3974 *p++ = 'U';
3975 *p++ = hexdigits[(ucs >> 28) & 0xf];
3976 *p++ = hexdigits[(ucs >> 24) & 0xf];
3977 *p++ = hexdigits[(ucs >> 20) & 0xf];
3978 *p++ = hexdigits[(ucs >> 16) & 0xf];
3979 *p++ = hexdigits[(ucs >> 12) & 0xf];
3980 *p++ = hexdigits[(ucs >> 8) & 0xf];
3981 *p++ = hexdigits[(ucs >> 4) & 0xf];
3982 *p++ = hexdigits[ucs & 0xf];
3983 continue;
3984 }
3985 /* Fall through: isolated surrogates are copied as-is */
3986 s--;
3987 size++;
3988 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003989#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 /* Map 16-bit characters to '\uxxxx' */
3991 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 *p++ = '\\';
3993 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003994 *p++ = hexdigits[(ch >> 12) & 0xf];
3995 *p++ = hexdigits[(ch >> 8) & 0xf];
3996 *p++ = hexdigits[(ch >> 4) & 0xf];
3997 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 /* Copy everything else as-is */
4000 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 *p++ = (char) ch;
4002 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004003 size = p - q;
4004
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004005 assert(size > 0);
4006 if (_PyBytes_Resize(&repr, size) < 0)
4007 return NULL;
4008 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009}
4010
4011PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4012{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004013 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004015 PyErr_BadArgument();
4016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004018 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4019 PyUnicode_GET_SIZE(unicode));
4020
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004021 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022}
4023
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004024/* --- Unicode Internal Codec ------------------------------------------- */
4025
4026PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004027 Py_ssize_t size,
4028 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004029{
4030 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004031 Py_ssize_t startinpos;
4032 Py_ssize_t endinpos;
4033 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004034 PyUnicodeObject *v;
4035 Py_UNICODE *p;
4036 const char *end;
4037 const char *reason;
4038 PyObject *errorHandler = NULL;
4039 PyObject *exc = NULL;
4040
Neal Norwitzd43069c2006-01-08 01:12:10 +00004041#ifdef Py_UNICODE_WIDE
4042 Py_UNICODE unimax = PyUnicode_GetMax();
4043#endif
4044
Thomas Wouters89f507f2006-12-13 04:49:30 +00004045 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004046 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4047 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004048 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004049 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004051 p = PyUnicode_AS_UNICODE(v);
4052 end = s + size;
4053
4054 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004055 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004056 /* We have to sanity check the raw data, otherwise doom looms for
4057 some malformed UCS-4 data. */
4058 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004059#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004060 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004061#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004062 end-s < Py_UNICODE_SIZE
4063 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004064 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004065 startinpos = s - starts;
4066 if (end-s < Py_UNICODE_SIZE) {
4067 endinpos = end-starts;
4068 reason = "truncated input";
4069 }
4070 else {
4071 endinpos = s - starts + Py_UNICODE_SIZE;
4072 reason = "illegal code point (> 0x10FFFF)";
4073 }
4074 outpos = p - PyUnicode_AS_UNICODE(v);
4075 if (unicode_decode_call_errorhandler(
4076 errors, &errorHandler,
4077 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004078 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004079 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004080 goto onError;
4081 }
4082 }
4083 else {
4084 p++;
4085 s += Py_UNICODE_SIZE;
4086 }
4087 }
4088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004089 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004090 goto onError;
4091 Py_XDECREF(errorHandler);
4092 Py_XDECREF(exc);
4093 return (PyObject *)v;
4094
Benjamin Peterson29060642009-01-31 22:14:21 +00004095 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004096 Py_XDECREF(v);
4097 Py_XDECREF(errorHandler);
4098 Py_XDECREF(exc);
4099 return NULL;
4100}
4101
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102/* --- Latin-1 Codec ------------------------------------------------------ */
4103
4104PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004105 Py_ssize_t size,
4106 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107{
4108 PyUnicodeObject *v;
4109 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004110 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004111
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004113 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 Py_UNICODE r = *(unsigned char*)s;
4115 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004116 }
4117
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 v = _PyUnicode_New(size);
4119 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004124 e = s + size;
4125 /* Unrolling the copy makes it much faster by reducing the looping
4126 overhead. This is similar to what many memcpy() implementations do. */
4127 unrolled_end = e - 4;
4128 while (s < unrolled_end) {
4129 p[0] = (unsigned char) s[0];
4130 p[1] = (unsigned char) s[1];
4131 p[2] = (unsigned char) s[2];
4132 p[3] = (unsigned char) s[3];
4133 s += 4;
4134 p += 4;
4135 }
4136 while (s < e)
4137 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004139
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 Py_XDECREF(v);
4142 return NULL;
4143}
4144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145/* create or adjust a UnicodeEncodeError */
4146static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 const char *encoding,
4148 const Py_UNICODE *unicode, Py_ssize_t size,
4149 Py_ssize_t startpos, Py_ssize_t endpos,
4150 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 *exceptionObject = PyUnicodeEncodeError_Create(
4154 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 }
4156 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004157 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4158 goto onError;
4159 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4160 goto onError;
4161 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4162 goto onError;
4163 return;
4164 onError:
4165 Py_DECREF(*exceptionObject);
4166 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 }
4168}
4169
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170/* raises a UnicodeEncodeError */
4171static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 const char *encoding,
4173 const Py_UNICODE *unicode, Py_ssize_t size,
4174 Py_ssize_t startpos, Py_ssize_t endpos,
4175 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176{
4177 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181}
4182
4183/* error handling callback helper:
4184 build arguments, call the callback and check the arguments,
4185 put the result into newpos and return the replacement string, which
4186 has to be freed by the caller */
4187static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 PyObject **errorHandler,
4189 const char *encoding, const char *reason,
4190 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4191 Py_ssize_t startpos, Py_ssize_t endpos,
4192 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004194 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195
4196 PyObject *restuple;
4197 PyObject *resunicode;
4198
4199 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 }
4204
4205 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209
4210 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004211 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004215 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 Py_DECREF(restuple);
4217 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004219 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 &resunicode, newpos)) {
4221 Py_DECREF(restuple);
4222 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004224 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4225 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4226 Py_DECREF(restuple);
4227 return NULL;
4228 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004231 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004232 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4233 Py_DECREF(restuple);
4234 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004235 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 Py_INCREF(resunicode);
4237 Py_DECREF(restuple);
4238 return resunicode;
4239}
4240
4241static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004242 Py_ssize_t size,
4243 const char *errors,
4244 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245{
4246 /* output object */
4247 PyObject *res;
4248 /* pointers to the beginning and end+1 of input */
4249 const Py_UNICODE *startp = p;
4250 const Py_UNICODE *endp = p + size;
4251 /* pointer to the beginning of the unencodable characters */
4252 /* const Py_UNICODE *badp = NULL; */
4253 /* pointer into the output */
4254 char *str;
4255 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004256 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004257 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4258 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 PyObject *errorHandler = NULL;
4260 PyObject *exc = NULL;
4261 /* the following variable is used for caching string comparisons
4262 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4263 int known_errorHandler = -1;
4264
4265 /* allocate enough for a simple encoding without
4266 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004267 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004268 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004269 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004271 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004272 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004273 ressize = size;
4274
4275 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277
Benjamin Peterson29060642009-01-31 22:14:21 +00004278 /* can we encode this? */
4279 if (c<limit) {
4280 /* no overflow check, because we know that the space is enough */
4281 *str++ = (char)c;
4282 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004283 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 else {
4285 Py_ssize_t unicodepos = p-startp;
4286 Py_ssize_t requiredsize;
4287 PyObject *repunicode;
4288 Py_ssize_t repsize;
4289 Py_ssize_t newpos;
4290 Py_ssize_t respos;
4291 Py_UNICODE *uni2;
4292 /* startpos for collecting unencodable chars */
4293 const Py_UNICODE *collstart = p;
4294 const Py_UNICODE *collend = p;
4295 /* find all unecodable characters */
4296 while ((collend < endp) && ((*collend)>=limit))
4297 ++collend;
4298 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4299 if (known_errorHandler==-1) {
4300 if ((errors==NULL) || (!strcmp(errors, "strict")))
4301 known_errorHandler = 1;
4302 else if (!strcmp(errors, "replace"))
4303 known_errorHandler = 2;
4304 else if (!strcmp(errors, "ignore"))
4305 known_errorHandler = 3;
4306 else if (!strcmp(errors, "xmlcharrefreplace"))
4307 known_errorHandler = 4;
4308 else
4309 known_errorHandler = 0;
4310 }
4311 switch (known_errorHandler) {
4312 case 1: /* strict */
4313 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4314 goto onError;
4315 case 2: /* replace */
4316 while (collstart++<collend)
4317 *str++ = '?'; /* fall through */
4318 case 3: /* ignore */
4319 p = collend;
4320 break;
4321 case 4: /* xmlcharrefreplace */
4322 respos = str - PyBytes_AS_STRING(res);
4323 /* determine replacement size (temporarily (mis)uses p) */
4324 for (p = collstart, repsize = 0; p < collend; ++p) {
4325 if (*p<10)
4326 repsize += 2+1+1;
4327 else if (*p<100)
4328 repsize += 2+2+1;
4329 else if (*p<1000)
4330 repsize += 2+3+1;
4331 else if (*p<10000)
4332 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004333#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 else
4335 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004336#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004337 else if (*p<100000)
4338 repsize += 2+5+1;
4339 else if (*p<1000000)
4340 repsize += 2+6+1;
4341 else
4342 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004343#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 }
4345 requiredsize = respos+repsize+(endp-collend);
4346 if (requiredsize > ressize) {
4347 if (requiredsize<2*ressize)
4348 requiredsize = 2*ressize;
4349 if (_PyBytes_Resize(&res, requiredsize))
4350 goto onError;
4351 str = PyBytes_AS_STRING(res) + respos;
4352 ressize = requiredsize;
4353 }
4354 /* generate replacement (temporarily (mis)uses p) */
4355 for (p = collstart; p < collend; ++p) {
4356 str += sprintf(str, "&#%d;", (int)*p);
4357 }
4358 p = collend;
4359 break;
4360 default:
4361 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4362 encoding, reason, startp, size, &exc,
4363 collstart-startp, collend-startp, &newpos);
4364 if (repunicode == NULL)
4365 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004366 if (PyBytes_Check(repunicode)) {
4367 /* Directly copy bytes result to output. */
4368 repsize = PyBytes_Size(repunicode);
4369 if (repsize > 1) {
4370 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004371 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004372 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4373 Py_DECREF(repunicode);
4374 goto onError;
4375 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004376 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004377 ressize += repsize-1;
4378 }
4379 memcpy(str, PyBytes_AsString(repunicode), repsize);
4380 str += repsize;
4381 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004382 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004383 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004384 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 /* need more space? (at least enough for what we
4386 have+the replacement+the rest of the string, so
4387 we won't have to check space for encodable characters) */
4388 respos = str - PyBytes_AS_STRING(res);
4389 repsize = PyUnicode_GET_SIZE(repunicode);
4390 requiredsize = respos+repsize+(endp-collend);
4391 if (requiredsize > ressize) {
4392 if (requiredsize<2*ressize)
4393 requiredsize = 2*ressize;
4394 if (_PyBytes_Resize(&res, requiredsize)) {
4395 Py_DECREF(repunicode);
4396 goto onError;
4397 }
4398 str = PyBytes_AS_STRING(res) + respos;
4399 ressize = requiredsize;
4400 }
4401 /* check if there is anything unencodable in the replacement
4402 and copy it to the output */
4403 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4404 c = *uni2;
4405 if (c >= limit) {
4406 raise_encode_exception(&exc, encoding, startp, size,
4407 unicodepos, unicodepos+1, reason);
4408 Py_DECREF(repunicode);
4409 goto onError;
4410 }
4411 *str = (char)c;
4412 }
4413 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004414 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004415 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004416 }
4417 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004418 /* Resize if we allocated to much */
4419 size = str - PyBytes_AS_STRING(res);
4420 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004421 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004422 if (_PyBytes_Resize(&res, size) < 0)
4423 goto onError;
4424 }
4425
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426 Py_XDECREF(errorHandler);
4427 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004428 return res;
4429
4430 onError:
4431 Py_XDECREF(res);
4432 Py_XDECREF(errorHandler);
4433 Py_XDECREF(exc);
4434 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435}
4436
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 Py_ssize_t size,
4439 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442}
4443
4444PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4445{
4446 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 PyErr_BadArgument();
4448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 }
4450 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004451 PyUnicode_GET_SIZE(unicode),
4452 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453}
4454
4455/* --- 7-bit ASCII Codec -------------------------------------------------- */
4456
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 Py_ssize_t size,
4459 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 PyUnicodeObject *v;
4463 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004464 Py_ssize_t startinpos;
4465 Py_ssize_t endinpos;
4466 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 const char *e;
4468 PyObject *errorHandler = NULL;
4469 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004470
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004472 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 Py_UNICODE r = *(unsigned char*)s;
4474 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004475 }
Tim Petersced69f82003-09-16 20:30:58 +00004476
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 v = _PyUnicode_New(size);
4478 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 e = s + size;
4484 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 register unsigned char c = (unsigned char)*s;
4486 if (c < 128) {
4487 *p++ = c;
4488 ++s;
4489 }
4490 else {
4491 startinpos = s-starts;
4492 endinpos = startinpos + 1;
4493 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4494 if (unicode_decode_call_errorhandler(
4495 errors, &errorHandler,
4496 "ascii", "ordinal not in range(128)",
4497 &starts, &e, &startinpos, &endinpos, &exc, &s,
4498 &v, &outpos, &p))
4499 goto onError;
4500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004502 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4504 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 Py_XDECREF(errorHandler);
4506 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004508
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 Py_XDECREF(errorHandler);
4512 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513 return NULL;
4514}
4515
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 Py_ssize_t size,
4518 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521}
4522
4523PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4524{
4525 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 PyErr_BadArgument();
4527 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 }
4529 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 PyUnicode_GET_SIZE(unicode),
4531 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532}
4533
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004534#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004535
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004536/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004537
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004538#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004539#define NEED_RETRY
4540#endif
4541
4542/* XXX This code is limited to "true" double-byte encodings, as
4543 a) it assumes an incomplete character consists of a single byte, and
4544 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004546
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when the
   character before it (as located by CharPrev) is the same position, does
   not itself start with a lead byte, or is exactly two bytes wide. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (candidate - before == 2);
}
4557
4558/*
4559 * Decode MBCS string into unicode object. If 'final' is set, converts
4560 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4561 */
4562static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 const char *s, /* MBCS string */
4564 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004565 int final,
4566 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004567{
4568 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004569 Py_ssize_t n;
4570 DWORD usize;
4571 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004572
4573 assert(size >= 0);
4574
Victor Stinner554f3f02010-06-16 23:33:54 +00004575 /* check and handle 'errors' arg */
4576 if (errors==NULL || strcmp(errors, "strict")==0)
4577 flags = MB_ERR_INVALID_CHARS;
4578 else if (strcmp(errors, "ignore")==0)
4579 flags = 0;
4580 else {
4581 PyErr_Format(PyExc_ValueError,
4582 "mbcs encoding does not support errors='%s'",
4583 errors);
4584 return -1;
4585 }
4586
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004587 /* Skip trailing lead-byte unless 'final' is set */
4588 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004590
4591 /* First get the size of the result */
4592 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004593 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4594 if (usize==0)
4595 goto mbcs_decode_error;
4596 } else
4597 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004598
4599 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004600 /* Create unicode object */
4601 *v = _PyUnicode_New(usize);
4602 if (*v == NULL)
4603 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004604 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004605 }
4606 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 /* Extend unicode object */
4608 n = PyUnicode_GET_SIZE(*v);
4609 if (_PyUnicode_Resize(v, n + usize) < 0)
4610 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004611 }
4612
4613 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004614 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004616 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4617 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004619 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004620 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004621
4622mbcs_decode_error:
4623 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4624 we raise a UnicodeDecodeError - else it is a 'generic'
4625 windows error
4626 */
4627 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4628 /* Ideally, we should get reason from FormatMessage - this
4629 is the Windows 2000 English version of the message
4630 */
4631 PyObject *exc = NULL;
4632 const char *reason = "No mapping for the Unicode character exists "
4633 "in the target multi-byte code page.";
4634 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4635 if (exc != NULL) {
4636 PyCodec_StrictErrors(exc);
4637 Py_DECREF(exc);
4638 }
4639 } else {
4640 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4641 }
4642 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004643}
4644
4645PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 Py_ssize_t size,
4647 const char *errors,
4648 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004649{
4650 PyUnicodeObject *v = NULL;
4651 int done;
4652
4653 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004655
4656#ifdef NEED_RETRY
4657 retry:
4658 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004659 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004660 else
4661#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004662 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004663
4664 if (done < 0) {
4665 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004667 }
4668
4669 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004670 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004671
4672#ifdef NEED_RETRY
4673 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 s += done;
4675 size -= done;
4676 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004677 }
4678#endif
4679
4680 return (PyObject *)v;
4681}
4682
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004683PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 Py_ssize_t size,
4685 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004686{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004687 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4688}
4689
4690/*
4691 * Convert unicode into string object (MBCS).
4692 * Returns 0 if succeed, -1 otherwise.
4693 */
4694static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004696 int size, /* size of unicode */
4697 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004698{
Victor Stinner554f3f02010-06-16 23:33:54 +00004699 BOOL usedDefaultChar = FALSE;
4700 BOOL *pusedDefaultChar;
4701 int mbcssize;
4702 Py_ssize_t n;
4703 PyObject *exc = NULL;
4704 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004705
4706 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004707
Victor Stinner554f3f02010-06-16 23:33:54 +00004708 /* check and handle 'errors' arg */
4709 if (errors==NULL || strcmp(errors, "strict")==0) {
4710 flags = WC_NO_BEST_FIT_CHARS;
4711 pusedDefaultChar = &usedDefaultChar;
4712 } else if (strcmp(errors, "replace")==0) {
4713 flags = 0;
4714 pusedDefaultChar = NULL;
4715 } else {
4716 PyErr_Format(PyExc_ValueError,
4717 "mbcs encoding does not support errors='%s'",
4718 errors);
4719 return -1;
4720 }
4721
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004722 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004723 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004724 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4725 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004726 if (mbcssize == 0) {
4727 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4728 return -1;
4729 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004730 /* If we used a default char, then we failed! */
4731 if (pusedDefaultChar && *pusedDefaultChar)
4732 goto mbcs_encode_error;
4733 } else {
4734 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004735 }
4736
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004737 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 /* Create string object */
4739 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4740 if (*repr == NULL)
4741 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004742 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004743 }
4744 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004745 /* Extend string object */
4746 n = PyBytes_Size(*repr);
4747 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4748 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004749 }
4750
4751 /* Do the conversion */
4752 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004753 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004754 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4755 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4757 return -1;
4758 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004759 if (pusedDefaultChar && *pusedDefaultChar)
4760 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004761 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004762 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004763
4764mbcs_encode_error:
4765 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4766 Py_XDECREF(exc);
4767 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004768}
4769
4770PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 Py_ssize_t size,
4772 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004773{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004774 PyObject *repr = NULL;
4775 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004776
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004777#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004779 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004780 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004781 else
4782#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004783 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004784
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004785 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 Py_XDECREF(repr);
4787 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004788 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004789
4790#ifdef NEED_RETRY
4791 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 p += INT_MAX;
4793 size -= INT_MAX;
4794 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004795 }
4796#endif
4797
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004798 return repr;
4799}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004800
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004801PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4802{
4803 if (!PyUnicode_Check(unicode)) {
4804 PyErr_BadArgument();
4805 return NULL;
4806 }
4807 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 PyUnicode_GET_SIZE(unicode),
4809 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004810}
4811
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004812#undef NEED_RETRY
4813
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004814#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004815
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816/* --- Character Mapping Codec -------------------------------------------- */
4817
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 Py_ssize_t size,
4820 PyObject *mapping,
4821 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004824 Py_ssize_t startinpos;
4825 Py_ssize_t endinpos;
4826 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004827 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828 PyUnicodeObject *v;
4829 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004830 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 PyObject *errorHandler = NULL;
4832 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004833 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004834 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004835
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 /* Default to Latin-1 */
4837 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839
4840 v = _PyUnicode_New(size);
4841 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004847 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 mapstring = PyUnicode_AS_UNICODE(mapping);
4849 maplen = PyUnicode_GET_SIZE(mapping);
4850 while (s < e) {
4851 unsigned char ch = *s;
4852 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 if (ch < maplen)
4855 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 if (x == 0xfffe) {
4858 /* undefined mapping */
4859 outpos = p-PyUnicode_AS_UNICODE(v);
4860 startinpos = s-starts;
4861 endinpos = startinpos+1;
4862 if (unicode_decode_call_errorhandler(
4863 errors, &errorHandler,
4864 "charmap", "character maps to <undefined>",
4865 &starts, &e, &startinpos, &endinpos, &exc, &s,
4866 &v, &outpos, &p)) {
4867 goto onError;
4868 }
4869 continue;
4870 }
4871 *p++ = x;
4872 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004873 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004874 }
4875 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 while (s < e) {
4877 unsigned char ch = *s;
4878 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004879
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4881 w = PyLong_FromLong((long)ch);
4882 if (w == NULL)
4883 goto onError;
4884 x = PyObject_GetItem(mapping, w);
4885 Py_DECREF(w);
4886 if (x == NULL) {
4887 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4888 /* No mapping found means: mapping is undefined. */
4889 PyErr_Clear();
4890 x = Py_None;
4891 Py_INCREF(x);
4892 } else
4893 goto onError;
4894 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004895
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 /* Apply mapping */
4897 if (PyLong_Check(x)) {
4898 long value = PyLong_AS_LONG(x);
4899 if (value < 0 || value > 65535) {
4900 PyErr_SetString(PyExc_TypeError,
4901 "character mapping must be in range(65536)");
4902 Py_DECREF(x);
4903 goto onError;
4904 }
4905 *p++ = (Py_UNICODE)value;
4906 }
4907 else if (x == Py_None) {
4908 /* undefined mapping */
4909 outpos = p-PyUnicode_AS_UNICODE(v);
4910 startinpos = s-starts;
4911 endinpos = startinpos+1;
4912 if (unicode_decode_call_errorhandler(
4913 errors, &errorHandler,
4914 "charmap", "character maps to <undefined>",
4915 &starts, &e, &startinpos, &endinpos, &exc, &s,
4916 &v, &outpos, &p)) {
4917 Py_DECREF(x);
4918 goto onError;
4919 }
4920 Py_DECREF(x);
4921 continue;
4922 }
4923 else if (PyUnicode_Check(x)) {
4924 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004925
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 if (targetsize == 1)
4927 /* 1-1 mapping */
4928 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004929
Benjamin Peterson29060642009-01-31 22:14:21 +00004930 else if (targetsize > 1) {
4931 /* 1-n mapping */
4932 if (targetsize > extrachars) {
4933 /* resize first */
4934 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4935 Py_ssize_t needed = (targetsize - extrachars) + \
4936 (targetsize << 2);
4937 extrachars += needed;
4938 /* XXX overflow detection missing */
4939 if (_PyUnicode_Resize(&v,
4940 PyUnicode_GET_SIZE(v) + needed) < 0) {
4941 Py_DECREF(x);
4942 goto onError;
4943 }
4944 p = PyUnicode_AS_UNICODE(v) + oldpos;
4945 }
4946 Py_UNICODE_COPY(p,
4947 PyUnicode_AS_UNICODE(x),
4948 targetsize);
4949 p += targetsize;
4950 extrachars -= targetsize;
4951 }
4952 /* 1-0 mapping: skip the character */
4953 }
4954 else {
4955 /* wrong return value */
4956 PyErr_SetString(PyExc_TypeError,
4957 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004958 Py_DECREF(x);
4959 goto onError;
4960 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 Py_DECREF(x);
4962 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 }
4965 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4967 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004968 Py_XDECREF(errorHandler);
4969 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004971
Benjamin Peterson29060642009-01-31 22:14:21 +00004972 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 Py_XDECREF(errorHandler);
4974 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 Py_XDECREF(v);
4976 return NULL;
4977}
4978
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004979/* Charmap encoding: the lookup table */
4980
4981struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 PyObject_HEAD
4983 unsigned char level1[32];
4984 int count2, count3;
4985 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004986};
4987
4988static PyObject*
4989encoding_map_size(PyObject *obj, PyObject* args)
4990{
4991 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004992 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004994}
4995
4996static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004997 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 PyDoc_STR("Return the size (in bytes) of this object") },
4999 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005000};
5001
5002static void
5003encoding_map_dealloc(PyObject* o)
5004{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005005 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005006}
5007
5008static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005009 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 "EncodingMap", /*tp_name*/
5011 sizeof(struct encoding_map), /*tp_basicsize*/
5012 0, /*tp_itemsize*/
5013 /* methods */
5014 encoding_map_dealloc, /*tp_dealloc*/
5015 0, /*tp_print*/
5016 0, /*tp_getattr*/
5017 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005018 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005019 0, /*tp_repr*/
5020 0, /*tp_as_number*/
5021 0, /*tp_as_sequence*/
5022 0, /*tp_as_mapping*/
5023 0, /*tp_hash*/
5024 0, /*tp_call*/
5025 0, /*tp_str*/
5026 0, /*tp_getattro*/
5027 0, /*tp_setattro*/
5028 0, /*tp_as_buffer*/
5029 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5030 0, /*tp_doc*/
5031 0, /*tp_traverse*/
5032 0, /*tp_clear*/
5033 0, /*tp_richcompare*/
5034 0, /*tp_weaklistoffset*/
5035 0, /*tp_iter*/
5036 0, /*tp_iternext*/
5037 encoding_map_methods, /*tp_methods*/
5038 0, /*tp_members*/
5039 0, /*tp_getset*/
5040 0, /*tp_base*/
5041 0, /*tp_dict*/
5042 0, /*tp_descr_get*/
5043 0, /*tp_descr_set*/
5044 0, /*tp_dictoffset*/
5045 0, /*tp_init*/
5046 0, /*tp_alloc*/
5047 0, /*tp_new*/
5048 0, /*tp_free*/
5049 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005050};
5051
5052PyObject*
5053PyUnicode_BuildEncodingMap(PyObject* string)
5054{
5055 Py_UNICODE *decode;
5056 PyObject *result;
5057 struct encoding_map *mresult;
5058 int i;
5059 int need_dict = 0;
5060 unsigned char level1[32];
5061 unsigned char level2[512];
5062 unsigned char *mlevel1, *mlevel2, *mlevel3;
5063 int count2 = 0, count3 = 0;
5064
5065 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5066 PyErr_BadArgument();
5067 return NULL;
5068 }
5069 decode = PyUnicode_AS_UNICODE(string);
5070 memset(level1, 0xFF, sizeof level1);
5071 memset(level2, 0xFF, sizeof level2);
5072
5073 /* If there isn't a one-to-one mapping of NULL to \0,
5074 or if there are non-BMP characters, we need to use
5075 a mapping dictionary. */
5076 if (decode[0] != 0)
5077 need_dict = 1;
5078 for (i = 1; i < 256; i++) {
5079 int l1, l2;
5080 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005081#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005082 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005083#endif
5084 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005085 need_dict = 1;
5086 break;
5087 }
5088 if (decode[i] == 0xFFFE)
5089 /* unmapped character */
5090 continue;
5091 l1 = decode[i] >> 11;
5092 l2 = decode[i] >> 7;
5093 if (level1[l1] == 0xFF)
5094 level1[l1] = count2++;
5095 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005096 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005097 }
5098
5099 if (count2 >= 0xFF || count3 >= 0xFF)
5100 need_dict = 1;
5101
5102 if (need_dict) {
5103 PyObject *result = PyDict_New();
5104 PyObject *key, *value;
5105 if (!result)
5106 return NULL;
5107 for (i = 0; i < 256; i++) {
5108 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005109 key = PyLong_FromLong(decode[i]);
5110 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005111 if (!key || !value)
5112 goto failed1;
5113 if (PyDict_SetItem(result, key, value) == -1)
5114 goto failed1;
5115 Py_DECREF(key);
5116 Py_DECREF(value);
5117 }
5118 return result;
5119 failed1:
5120 Py_XDECREF(key);
5121 Py_XDECREF(value);
5122 Py_DECREF(result);
5123 return NULL;
5124 }
5125
5126 /* Create a three-level trie */
5127 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5128 16*count2 + 128*count3 - 1);
5129 if (!result)
5130 return PyErr_NoMemory();
5131 PyObject_Init(result, &EncodingMapType);
5132 mresult = (struct encoding_map*)result;
5133 mresult->count2 = count2;
5134 mresult->count3 = count3;
5135 mlevel1 = mresult->level1;
5136 mlevel2 = mresult->level23;
5137 mlevel3 = mresult->level23 + 16*count2;
5138 memcpy(mlevel1, level1, 32);
5139 memset(mlevel2, 0xFF, 16*count2);
5140 memset(mlevel3, 0, 128*count3);
5141 count3 = 0;
5142 for (i = 1; i < 256; i++) {
5143 int o1, o2, o3, i2, i3;
5144 if (decode[i] == 0xFFFE)
5145 /* unmapped character */
5146 continue;
5147 o1 = decode[i]>>11;
5148 o2 = (decode[i]>>7) & 0xF;
5149 i2 = 16*mlevel1[o1] + o2;
5150 if (mlevel2[i2] == 0xFF)
5151 mlevel2[i2] = count3++;
5152 o3 = decode[i] & 0x7F;
5153 i3 = 128*mlevel2[i2] + o3;
5154 mlevel3[i3] = i;
5155 }
5156 return result;
5157}
5158
5159static int
5160encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5161{
5162 struct encoding_map *map = (struct encoding_map*)mapping;
5163 int l1 = c>>11;
5164 int l2 = (c>>7) & 0xF;
5165 int l3 = c & 0x7F;
5166 int i;
5167
5168#ifdef Py_UNICODE_WIDE
5169 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005171 }
5172#endif
5173 if (c == 0)
5174 return 0;
5175 /* level 1*/
5176 i = map->level1[l1];
5177 if (i == 0xFF) {
5178 return -1;
5179 }
5180 /* level 2*/
5181 i = map->level23[16*i+l2];
5182 if (i == 0xFF) {
5183 return -1;
5184 }
5185 /* level 3 */
5186 i = map->level23[16*map->count2 + 128*i + l3];
5187 if (i == 0) {
5188 return -1;
5189 }
5190 return i;
5191}
5192
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005193/* Lookup the character ch in the mapping. If the character
5194 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005195 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197{
Christian Heimes217cfd12007-12-02 14:31:20 +00005198 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005199 PyObject *x;
5200
5201 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005202 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005203 x = PyObject_GetItem(mapping, w);
5204 Py_DECREF(w);
5205 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5207 /* No mapping found means: mapping is undefined. */
5208 PyErr_Clear();
5209 x = Py_None;
5210 Py_INCREF(x);
5211 return x;
5212 } else
5213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005215 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005217 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 long value = PyLong_AS_LONG(x);
5219 if (value < 0 || value > 255) {
5220 PyErr_SetString(PyExc_TypeError,
5221 "character mapping must be in range(256)");
5222 Py_DECREF(x);
5223 return NULL;
5224 }
5225 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005227 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 /* wrong return value */
5231 PyErr_Format(PyExc_TypeError,
5232 "character mapping must return integer, bytes or None, not %.400s",
5233 x->ob_type->tp_name);
5234 Py_DECREF(x);
5235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 }
5237}
5238
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005239static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005240charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005241{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005242 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5243 /* exponentially overallocate to minimize reallocations */
5244 if (requiredsize < 2*outsize)
5245 requiredsize = 2*outsize;
5246 if (_PyBytes_Resize(outobj, requiredsize))
5247 return -1;
5248 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005249}
5250
/* Outcome of trying to write one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was encoded and written */
    enc_FAILED = 1,     /* the mapping has no entry; nothing was written */
    enc_EXCEPTION = 2   /* an error occurred while looking up or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005254/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005255 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005256 space is available. Return a new reference to the object that
5257 was put in the output buffer, or Py_None, if the mapping was undefined
5258 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005259 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005260static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005261charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005263{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005264 PyObject *rep;
5265 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005266 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005267
Christian Heimes90aa7642007-12-19 02:45:37 +00005268 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005269 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005271 if (res == -1)
5272 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 if (outsize<requiredsize)
5274 if (charmapencode_resize(outobj, outpos, requiredsize))
5275 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005276 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 outstart[(*outpos)++] = (char)res;
5278 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005279 }
5280
5281 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005282 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005284 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 Py_DECREF(rep);
5286 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005287 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 if (PyLong_Check(rep)) {
5289 Py_ssize_t requiredsize = *outpos+1;
5290 if (outsize<requiredsize)
5291 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5292 Py_DECREF(rep);
5293 return enc_EXCEPTION;
5294 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005295 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005297 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 else {
5299 const char *repchars = PyBytes_AS_STRING(rep);
5300 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5301 Py_ssize_t requiredsize = *outpos+repsize;
5302 if (outsize<requiredsize)
5303 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5304 Py_DECREF(rep);
5305 return enc_EXCEPTION;
5306 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005307 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 memcpy(outstart + *outpos, repchars, repsize);
5309 *outpos += repsize;
5310 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005311 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005312 Py_DECREF(rep);
5313 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005314}
5315
5316/* handle an error in PyUnicode_EncodeCharmap
5317 Return 0 on success, -1 on error */
5318static
5319int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005320 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005321 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005322 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005323 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324{
5325 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005326 Py_ssize_t repsize;
5327 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005328 Py_UNICODE *uni2;
5329 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005330 Py_ssize_t collstartpos = *inpos;
5331 Py_ssize_t collendpos = *inpos+1;
5332 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005333 char *encoding = "charmap";
5334 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005335 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337 /* find all unencodable characters */
5338 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005339 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005340 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 int res = encoding_map_lookup(p[collendpos], mapping);
5342 if (res != -1)
5343 break;
5344 ++collendpos;
5345 continue;
5346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005347
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 rep = charmapencode_lookup(p[collendpos], mapping);
5349 if (rep==NULL)
5350 return -1;
5351 else if (rep!=Py_None) {
5352 Py_DECREF(rep);
5353 break;
5354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005355 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005357 }
5358 /* cache callback name lookup
5359 * (if not done yet, i.e. it's the first error) */
5360 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 if ((errors==NULL) || (!strcmp(errors, "strict")))
5362 *known_errorHandler = 1;
5363 else if (!strcmp(errors, "replace"))
5364 *known_errorHandler = 2;
5365 else if (!strcmp(errors, "ignore"))
5366 *known_errorHandler = 3;
5367 else if (!strcmp(errors, "xmlcharrefreplace"))
5368 *known_errorHandler = 4;
5369 else
5370 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 }
5372 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005373 case 1: /* strict */
5374 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5375 return -1;
5376 case 2: /* replace */
5377 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 x = charmapencode_output('?', mapping, res, respos);
5379 if (x==enc_EXCEPTION) {
5380 return -1;
5381 }
5382 else if (x==enc_FAILED) {
5383 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5384 return -1;
5385 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005386 }
5387 /* fall through */
5388 case 3: /* ignore */
5389 *inpos = collendpos;
5390 break;
5391 case 4: /* xmlcharrefreplace */
5392 /* generate replacement (temporarily (mis)uses p) */
5393 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 char buffer[2+29+1+1];
5395 char *cp;
5396 sprintf(buffer, "&#%d;", (int)p[collpos]);
5397 for (cp = buffer; *cp; ++cp) {
5398 x = charmapencode_output(*cp, mapping, res, respos);
5399 if (x==enc_EXCEPTION)
5400 return -1;
5401 else if (x==enc_FAILED) {
5402 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5403 return -1;
5404 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005405 }
5406 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005407 *inpos = collendpos;
5408 break;
5409 default:
5410 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 encoding, reason, p, size, exceptionObject,
5412 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005413 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005415 if (PyBytes_Check(repunicode)) {
5416 /* Directly copy bytes result to output. */
5417 Py_ssize_t outsize = PyBytes_Size(*res);
5418 Py_ssize_t requiredsize;
5419 repsize = PyBytes_Size(repunicode);
5420 requiredsize = *respos + repsize;
5421 if (requiredsize > outsize)
5422 /* Make room for all additional bytes. */
5423 if (charmapencode_resize(res, respos, requiredsize)) {
5424 Py_DECREF(repunicode);
5425 return -1;
5426 }
5427 memcpy(PyBytes_AsString(*res) + *respos,
5428 PyBytes_AsString(repunicode), repsize);
5429 *respos += repsize;
5430 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005431 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005432 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005434 /* generate replacement */
5435 repsize = PyUnicode_GET_SIZE(repunicode);
5436 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 x = charmapencode_output(*uni2, mapping, res, respos);
5438 if (x==enc_EXCEPTION) {
5439 return -1;
5440 }
5441 else if (x==enc_FAILED) {
5442 Py_DECREF(repunicode);
5443 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5444 return -1;
5445 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005446 }
5447 *inpos = newpos;
5448 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005449 }
5450 return 0;
5451}
5452
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 Py_ssize_t size,
5455 PyObject *mapping,
5456 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 /* output object */
5459 PyObject *res = NULL;
5460 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005461 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005463 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005464 PyObject *errorHandler = NULL;
5465 PyObject *exc = NULL;
5466 /* the following variable is used for caching string comparisons
5467 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5468 * 3=ignore, 4=xmlcharrefreplace */
5469 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470
5471 /* Default to Latin-1 */
5472 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005473 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 /* allocate enough for a simple encoding without
5476 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005477 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478 if (res == NULL)
5479 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005480 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005483 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 /* try to encode it */
5485 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5486 if (x==enc_EXCEPTION) /* error */
5487 goto onError;
5488 if (x==enc_FAILED) { /* unencodable character */
5489 if (charmap_encoding_error(p, size, &inpos, mapping,
5490 &exc,
5491 &known_errorHandler, &errorHandler, errors,
5492 &res, &respos)) {
5493 goto onError;
5494 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005495 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 else
5497 /* done with this character => adjust input position */
5498 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005502 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005503 if (_PyBytes_Resize(&res, respos) < 0)
5504 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005505
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506 Py_XDECREF(exc);
5507 Py_XDECREF(errorHandler);
5508 return res;
5509
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 Py_XDECREF(res);
5512 Py_XDECREF(exc);
5513 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 return NULL;
5515}
5516
5517PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519{
5520 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 PyErr_BadArgument();
5522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 }
5524 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 PyUnicode_GET_SIZE(unicode),
5526 mapping,
5527 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528}
5529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005530/* create or adjust a UnicodeTranslateError */
5531static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 const Py_UNICODE *unicode, Py_ssize_t size,
5533 Py_ssize_t startpos, Py_ssize_t endpos,
5534 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005536 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005537 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 }
5540 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5542 goto onError;
5543 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5544 goto onError;
5545 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5546 goto onError;
5547 return;
5548 onError:
5549 Py_DECREF(*exceptionObject);
5550 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 }
5552}
5553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554/* raises a UnicodeTranslateError */
5555static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 const Py_UNICODE *unicode, Py_ssize_t size,
5557 Py_ssize_t startpos, Py_ssize_t endpos,
5558 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559{
5560 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005561 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005562 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564}
5565
5566/* error handling callback helper:
5567 build arguments, call the callback and check the arguments,
5568 put the result into newpos and return the replacement string, which
5569 has to be freed by the caller */
5570static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 PyObject **errorHandler,
5572 const char *reason,
5573 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5574 Py_ssize_t startpos, Py_ssize_t endpos,
5575 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005577 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005579 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 PyObject *restuple;
5581 PyObject *resunicode;
5582
5583 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005584 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587 }
5588
5589 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005593
5594 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005599 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 Py_DECREF(restuple);
5601 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602 }
5603 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 &resunicode, &i_newpos)) {
5605 Py_DECREF(restuple);
5606 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005608 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005610 else
5611 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005612 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5614 Py_DECREF(restuple);
5615 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005616 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 Py_INCREF(resunicode);
5618 Py_DECREF(restuple);
5619 return resunicode;
5620}
5621
5622/* Lookup the character ch in the mapping and put the result in result,
5623 which must be decrefed by the caller.
5624 Return 0 on success, -1 on error */
5625static
5626int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5627{
Christian Heimes217cfd12007-12-02 14:31:20 +00005628 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 PyObject *x;
5630
5631 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 x = PyObject_GetItem(mapping, w);
5634 Py_DECREF(w);
5635 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5637 /* No mapping found means: use 1:1 mapping. */
5638 PyErr_Clear();
5639 *result = NULL;
5640 return 0;
5641 } else
5642 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 }
5644 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 *result = x;
5646 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005648 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 long value = PyLong_AS_LONG(x);
5650 long max = PyUnicode_GetMax();
5651 if (value < 0 || value > max) {
5652 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005653 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005654 Py_DECREF(x);
5655 return -1;
5656 }
5657 *result = x;
5658 return 0;
5659 }
5660 else if (PyUnicode_Check(x)) {
5661 *result = x;
5662 return 0;
5663 }
5664 else {
5665 /* wrong return value */
5666 PyErr_SetString(PyExc_TypeError,
5667 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005668 Py_DECREF(x);
5669 return -1;
5670 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671}
5672/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 if not reallocate and adjust various state variables.
5674 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675static
Walter Dörwald4894c302003-10-24 14:25:28 +00005676int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005678{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005679 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005680 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 /* remember old output position */
5682 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5683 /* exponentially overallocate to minimize reallocations */
5684 if (requiredsize < 2 * oldsize)
5685 requiredsize = 2 * oldsize;
5686 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5687 return -1;
5688 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689 }
5690 return 0;
5691}
5692/* lookup the character, put the result in the output string and adjust
5693 various state variables. Return a new reference to the object that
5694 was put in the output buffer in *result, or Py_None, if the mapping was
5695 undefined (in which case no character was written).
5696 The called must decref result.
5697 Return 0 on success, -1 on error. */
5698static
Walter Dörwald4894c302003-10-24 14:25:28 +00005699int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5701 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702{
Walter Dörwald4894c302003-10-24 14:25:28 +00005703 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 /* not found => default to 1:1 mapping */
5707 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 }
5709 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005711 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 /* no overflow check, because we know that the space is enough */
5713 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005714 }
5715 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5717 if (repsize==1) {
5718 /* no overflow check, because we know that the space is enough */
5719 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5720 }
5721 else if (repsize!=0) {
5722 /* more than one character */
5723 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5724 (insize - (curinp-startinp)) +
5725 repsize - 1;
5726 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5727 return -1;
5728 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5729 *outp += repsize;
5730 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 }
5732 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734 return 0;
5735}
5736
5737PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 Py_ssize_t size,
5739 PyObject *mapping,
5740 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 /* output object */
5743 PyObject *res = NULL;
5744 /* pointers to the beginning and end+1 of input */
5745 const Py_UNICODE *startp = p;
5746 const Py_UNICODE *endp = p + size;
5747 /* pointer into the output */
5748 Py_UNICODE *str;
5749 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005750 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005751 char *reason = "character maps to <undefined>";
5752 PyObject *errorHandler = NULL;
5753 PyObject *exc = NULL;
5754 /* the following variable is used for caching string comparisons
5755 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5756 * 3=ignore, 4=xmlcharrefreplace */
5757 int known_errorHandler = -1;
5758
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 PyErr_BadArgument();
5761 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763
5764 /* allocate enough for a simple 1:1 translation without
5765 replacements, if we need more, we'll resize */
5766 res = PyUnicode_FromUnicode(NULL, size);
5767 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 /* try to encode it */
5775 PyObject *x = NULL;
5776 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5777 Py_XDECREF(x);
5778 goto onError;
5779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005780 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 if (x!=Py_None) /* it worked => adjust input pointer */
5782 ++p;
5783 else { /* untranslatable character */
5784 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5785 Py_ssize_t repsize;
5786 Py_ssize_t newpos;
5787 Py_UNICODE *uni2;
5788 /* startpos for collecting untranslatable chars */
5789 const Py_UNICODE *collstart = p;
5790 const Py_UNICODE *collend = p+1;
5791 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 /* find all untranslatable characters */
5794 while (collend < endp) {
5795 if (charmaptranslate_lookup(*collend, mapping, &x))
5796 goto onError;
5797 Py_XDECREF(x);
5798 if (x!=Py_None)
5799 break;
5800 ++collend;
5801 }
5802 /* cache callback name lookup
5803 * (if not done yet, i.e. it's the first error) */
5804 if (known_errorHandler==-1) {
5805 if ((errors==NULL) || (!strcmp(errors, "strict")))
5806 known_errorHandler = 1;
5807 else if (!strcmp(errors, "replace"))
5808 known_errorHandler = 2;
5809 else if (!strcmp(errors, "ignore"))
5810 known_errorHandler = 3;
5811 else if (!strcmp(errors, "xmlcharrefreplace"))
5812 known_errorHandler = 4;
5813 else
5814 known_errorHandler = 0;
5815 }
5816 switch (known_errorHandler) {
5817 case 1: /* strict */
5818 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005819 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 case 2: /* replace */
5821 /* No need to check for space, this is a 1:1 replacement */
5822 for (coll = collstart; coll<collend; ++coll)
5823 *str++ = '?';
5824 /* fall through */
5825 case 3: /* ignore */
5826 p = collend;
5827 break;
5828 case 4: /* xmlcharrefreplace */
5829 /* generate replacement (temporarily (mis)uses p) */
5830 for (p = collstart; p < collend; ++p) {
5831 char buffer[2+29+1+1];
5832 char *cp;
5833 sprintf(buffer, "&#%d;", (int)*p);
5834 if (charmaptranslate_makespace(&res, &str,
5835 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5836 goto onError;
5837 for (cp = buffer; *cp; ++cp)
5838 *str++ = *cp;
5839 }
5840 p = collend;
5841 break;
5842 default:
5843 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5844 reason, startp, size, &exc,
5845 collstart-startp, collend-startp, &newpos);
5846 if (repunicode == NULL)
5847 goto onError;
5848 /* generate replacement */
5849 repsize = PyUnicode_GET_SIZE(repunicode);
5850 if (charmaptranslate_makespace(&res, &str,
5851 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5852 Py_DECREF(repunicode);
5853 goto onError;
5854 }
5855 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5856 *str++ = *uni2;
5857 p = startp + newpos;
5858 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005859 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005860 }
5861 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 /* Resize if we allocated to much */
5863 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005864 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 if (PyUnicode_Resize(&res, respos) < 0)
5866 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 }
5868 Py_XDECREF(exc);
5869 Py_XDECREF(errorHandler);
5870 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873 Py_XDECREF(res);
5874 Py_XDECREF(exc);
5875 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 return NULL;
5877}
5878
5879PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 PyObject *mapping,
5881 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882{
5883 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005884
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 str = PyUnicode_FromObject(str);
5886 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 PyUnicode_GET_SIZE(str),
5890 mapping,
5891 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 Py_DECREF(str);
5893 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005894
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 Py_XDECREF(str);
5897 return NULL;
5898}
Tim Petersced69f82003-09-16 20:30:58 +00005899
Guido van Rossum9e896b32000-04-05 20:11:21 +00005900/* --- Decimal Encoder ---------------------------------------------------- */
5901
5902int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 Py_ssize_t length,
5904 char *output,
5905 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005906{
5907 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 PyObject *errorHandler = NULL;
5909 PyObject *exc = NULL;
5910 const char *encoding = "decimal";
5911 const char *reason = "invalid decimal Unicode string";
5912 /* the following variable is used for caching string comparisons
5913 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5914 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005915
5916 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 PyErr_BadArgument();
5918 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005919 }
5920
5921 p = s;
5922 end = s + length;
5923 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 register Py_UNICODE ch = *p;
5925 int decimal;
5926 PyObject *repunicode;
5927 Py_ssize_t repsize;
5928 Py_ssize_t newpos;
5929 Py_UNICODE *uni2;
5930 Py_UNICODE *collstart;
5931 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005932
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005934 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 ++p;
5936 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005937 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 decimal = Py_UNICODE_TODECIMAL(ch);
5939 if (decimal >= 0) {
5940 *output++ = '0' + decimal;
5941 ++p;
5942 continue;
5943 }
5944 if (0 < ch && ch < 256) {
5945 *output++ = (char)ch;
5946 ++p;
5947 continue;
5948 }
5949 /* All other characters are considered unencodable */
5950 collstart = p;
5951 collend = p+1;
5952 while (collend < end) {
5953 if ((0 < *collend && *collend < 256) ||
5954 !Py_UNICODE_ISSPACE(*collend) ||
5955 Py_UNICODE_TODECIMAL(*collend))
5956 break;
5957 }
5958 /* cache callback name lookup
5959 * (if not done yet, i.e. it's the first error) */
5960 if (known_errorHandler==-1) {
5961 if ((errors==NULL) || (!strcmp(errors, "strict")))
5962 known_errorHandler = 1;
5963 else if (!strcmp(errors, "replace"))
5964 known_errorHandler = 2;
5965 else if (!strcmp(errors, "ignore"))
5966 known_errorHandler = 3;
5967 else if (!strcmp(errors, "xmlcharrefreplace"))
5968 known_errorHandler = 4;
5969 else
5970 known_errorHandler = 0;
5971 }
5972 switch (known_errorHandler) {
5973 case 1: /* strict */
5974 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5975 goto onError;
5976 case 2: /* replace */
5977 for (p = collstart; p < collend; ++p)
5978 *output++ = '?';
5979 /* fall through */
5980 case 3: /* ignore */
5981 p = collend;
5982 break;
5983 case 4: /* xmlcharrefreplace */
5984 /* generate replacement (temporarily (mis)uses p) */
5985 for (p = collstart; p < collend; ++p)
5986 output += sprintf(output, "&#%d;", (int)*p);
5987 p = collend;
5988 break;
5989 default:
5990 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5991 encoding, reason, s, length, &exc,
5992 collstart-s, collend-s, &newpos);
5993 if (repunicode == NULL)
5994 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005995 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005996 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005997 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5998 Py_DECREF(repunicode);
5999 goto onError;
6000 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 /* generate replacement */
6002 repsize = PyUnicode_GET_SIZE(repunicode);
6003 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6004 Py_UNICODE ch = *uni2;
6005 if (Py_UNICODE_ISSPACE(ch))
6006 *output++ = ' ';
6007 else {
6008 decimal = Py_UNICODE_TODECIMAL(ch);
6009 if (decimal >= 0)
6010 *output++ = '0' + decimal;
6011 else if (0 < ch && ch < 256)
6012 *output++ = (char)ch;
6013 else {
6014 Py_DECREF(repunicode);
6015 raise_encode_exception(&exc, encoding,
6016 s, length, collstart-s, collend-s, reason);
6017 goto onError;
6018 }
6019 }
6020 }
6021 p = s + newpos;
6022 Py_DECREF(repunicode);
6023 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006024 }
6025 /* 0-terminate the output string */
6026 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006027 Py_XDECREF(exc);
6028 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006029 return 0;
6030
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032 Py_XDECREF(exc);
6033 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006034 return -1;
6035}
6036
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037/* --- Helpers ------------------------------------------------------------ */
6038
Eric Smith8c663262007-08-25 02:26:07 +00006039#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006040#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006041
Thomas Wouters477c8d52006-05-27 19:21:47 +00006042#include "stringlib/count.h"
6043#include "stringlib/find.h"
6044#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006045#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006046
Eric Smith5807c412008-05-11 21:00:57 +00006047#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006048#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006049#include "stringlib/localeutil.h"
6050
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped down to `len`; a negative `end` or `start` is taken
   relative to `len` and, if still negative, set to 0.  `start` is not
   clamped against `len`.

   `start` and `end` must be modifiable lvalues, and every argument may be
   evaluated more than once.  The body is wrapped in do { } while (0) so
   that the expansion is a single statement and can be used safely as the
   body of an unbraced if/else; invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006065
Martin v. Löwis18e16552006-02-15 17:27:45 +00006066Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006067 PyObject *substr,
6068 Py_ssize_t start,
6069 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006071 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006072 PyUnicodeObject* str_obj;
6073 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006074
Thomas Wouters477c8d52006-05-27 19:21:47 +00006075 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6076 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006078 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6079 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 Py_DECREF(str_obj);
6081 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 }
Tim Petersced69f82003-09-16 20:30:58 +00006083
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006084 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006085 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006086 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6087 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006088 );
6089
6090 Py_DECREF(sub_obj);
6091 Py_DECREF(str_obj);
6092
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 return result;
6094}
6095
Martin v. Löwis18e16552006-02-15 17:27:45 +00006096Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006097 PyObject *sub,
6098 Py_ssize_t start,
6099 Py_ssize_t end,
6100 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006102 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006103
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006105 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006107 sub = PyUnicode_FromObject(sub);
6108 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 Py_DECREF(str);
6110 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 }
Tim Petersced69f82003-09-16 20:30:58 +00006112
Thomas Wouters477c8d52006-05-27 19:21:47 +00006113 if (direction > 0)
6114 result = stringlib_find_slice(
6115 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6116 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6117 start, end
6118 );
6119 else
6120 result = stringlib_rfind_slice(
6121 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6122 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6123 start, end
6124 );
6125
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006127 Py_DECREF(sub);
6128
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 return result;
6130}
6131
Tim Petersced69f82003-09-16 20:30:58 +00006132static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 PyUnicodeObject *substring,
6135 Py_ssize_t start,
6136 Py_ssize_t end,
6137 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 if (substring->length == 0)
6140 return 1;
6141
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006142 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 end -= substring->length;
6144 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
6147 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 if (Py_UNICODE_MATCH(self, end, substring))
6149 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 } else {
6151 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 }
6154
6155 return 0;
6156}
6157
Martin v. Löwis18e16552006-02-15 17:27:45 +00006158Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 PyObject *substr,
6160 Py_ssize_t start,
6161 Py_ssize_t end,
6162 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006164 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006165
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 str = PyUnicode_FromObject(str);
6167 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 substr = PyUnicode_FromObject(substr);
6170 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 Py_DECREF(str);
6172 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 }
Tim Petersced69f82003-09-16 20:30:58 +00006174
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 (PyUnicodeObject *)substr,
6177 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 Py_DECREF(str);
6179 Py_DECREF(substr);
6180 return result;
6181}
6182
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183/* Apply fixfct filter to the Unicode object self and return a
6184 reference to the modified object */
6185
Tim Petersced69f82003-09-16 20:30:58 +00006186static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189{
6190
6191 PyUnicodeObject *u;
6192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006193 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006196
6197 Py_UNICODE_COPY(u->str, self->str, self->length);
6198
Tim Peters7a29bd52001-09-12 03:03:31 +00006199 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 /* fixfct should return TRUE if it modified the buffer. If
6201 FALSE, return a reference to the original buffer instead
6202 (to save space, not time) */
6203 Py_INCREF(self);
6204 Py_DECREF(u);
6205 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 }
6207 return (PyObject*) u;
6208}
6209
Tim Petersced69f82003-09-16 20:30:58 +00006210static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211int fixupper(PyUnicodeObject *self)
6212{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006213 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 Py_UNICODE *s = self->str;
6215 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006216
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006219
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 ch = Py_UNICODE_TOUPPER(*s);
6221 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 *s = ch;
6224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 s++;
6226 }
6227
6228 return status;
6229}
6230
Tim Petersced69f82003-09-16 20:30:58 +00006231static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232int fixlower(PyUnicodeObject *self)
6233{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006234 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 Py_UNICODE *s = self->str;
6236 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006237
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006240
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 ch = Py_UNICODE_TOLOWER(*s);
6242 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 *s = ch;
6245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 s++;
6247 }
6248
6249 return status;
6250}
6251
Tim Petersced69f82003-09-16 20:30:58 +00006252static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253int fixswapcase(PyUnicodeObject *self)
6254{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006255 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 Py_UNICODE *s = self->str;
6257 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006258
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259 while (len-- > 0) {
6260 if (Py_UNICODE_ISUPPER(*s)) {
6261 *s = Py_UNICODE_TOLOWER(*s);
6262 status = 1;
6263 } else if (Py_UNICODE_ISLOWER(*s)) {
6264 *s = Py_UNICODE_TOUPPER(*s);
6265 status = 1;
6266 }
6267 s++;
6268 }
6269
6270 return status;
6271}
6272
Tim Petersced69f82003-09-16 20:30:58 +00006273static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274int fixcapitalize(PyUnicodeObject *self)
6275{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006276 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006277 Py_UNICODE *s = self->str;
6278 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006279
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006280 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006282 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 *s = Py_UNICODE_TOUPPER(*s);
6284 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006286 s++;
6287 while (--len > 0) {
6288 if (Py_UNICODE_ISUPPER(*s)) {
6289 *s = Py_UNICODE_TOLOWER(*s);
6290 status = 1;
6291 }
6292 s++;
6293 }
6294 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295}
6296
6297static
6298int fixtitle(PyUnicodeObject *self)
6299{
6300 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6301 register Py_UNICODE *e;
6302 int previous_is_cased;
6303
6304 /* Shortcut for single character strings */
6305 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6307 if (*p != ch) {
6308 *p = ch;
6309 return 1;
6310 }
6311 else
6312 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 }
Tim Petersced69f82003-09-16 20:30:58 +00006314
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 e = p + PyUnicode_GET_SIZE(self);
6316 previous_is_cased = 0;
6317 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006319
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 if (previous_is_cased)
6321 *p = Py_UNICODE_TOLOWER(ch);
6322 else
6323 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006324
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 if (Py_UNICODE_ISLOWER(ch) ||
6326 Py_UNICODE_ISUPPER(ch) ||
6327 Py_UNICODE_ISTITLE(ch))
6328 previous_is_cased = 1;
6329 else
6330 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 }
6332 return 1;
6333}
6334
Tim Peters8ce9f162004-08-27 01:49:32 +00006335PyObject *
6336PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337{
Skip Montanaro6543b452004-09-16 03:28:13 +00006338 const Py_UNICODE blank = ' ';
6339 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006340 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006341 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006342 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6343 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006344 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6345 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006346 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006347 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348
Tim Peters05eba1f2004-08-27 21:32:02 +00006349 fseq = PySequence_Fast(seq, "");
6350 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006351 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006352 }
6353
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006354 /* NOTE: the following code can't call back into Python code,
6355 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006356 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006357
Tim Peters05eba1f2004-08-27 21:32:02 +00006358 seqlen = PySequence_Fast_GET_SIZE(fseq);
6359 /* If empty sequence, return u"". */
6360 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006361 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6362 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006363 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006364 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006365 /* If singleton sequence with an exact Unicode, return that. */
6366 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 item = items[0];
6368 if (PyUnicode_CheckExact(item)) {
6369 Py_INCREF(item);
6370 res = (PyUnicodeObject *)item;
6371 goto Done;
6372 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006373 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006374 else {
6375 /* Set up sep and seplen */
6376 if (separator == NULL) {
6377 sep = &blank;
6378 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006379 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006380 else {
6381 if (!PyUnicode_Check(separator)) {
6382 PyErr_Format(PyExc_TypeError,
6383 "separator: expected str instance,"
6384 " %.80s found",
6385 Py_TYPE(separator)->tp_name);
6386 goto onError;
6387 }
6388 sep = PyUnicode_AS_UNICODE(separator);
6389 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006390 }
6391 }
6392
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006393 /* There are at least two things to join, or else we have a subclass
6394 * of str in the sequence.
6395 * Do a pre-pass to figure out the total amount of space we'll
6396 * need (sz), and see whether all argument are strings.
6397 */
6398 sz = 0;
6399 for (i = 0; i < seqlen; i++) {
6400 const Py_ssize_t old_sz = sz;
6401 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 if (!PyUnicode_Check(item)) {
6403 PyErr_Format(PyExc_TypeError,
6404 "sequence item %zd: expected str instance,"
6405 " %.80s found",
6406 i, Py_TYPE(item)->tp_name);
6407 goto onError;
6408 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006409 sz += PyUnicode_GET_SIZE(item);
6410 if (i != 0)
6411 sz += seplen;
6412 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6413 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006415 goto onError;
6416 }
6417 }
Tim Petersced69f82003-09-16 20:30:58 +00006418
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006419 res = _PyUnicode_New(sz);
6420 if (res == NULL)
6421 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006422
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006423 /* Catenate everything. */
6424 res_p = PyUnicode_AS_UNICODE(res);
6425 for (i = 0; i < seqlen; ++i) {
6426 Py_ssize_t itemlen;
6427 item = items[i];
6428 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 /* Copy item, and maybe the separator. */
6430 if (i) {
6431 Py_UNICODE_COPY(res_p, sep, seplen);
6432 res_p += seplen;
6433 }
6434 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6435 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006436 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006437
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006439 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 return (PyObject *)res;
6441
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006443 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006444 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 return NULL;
6446}
6447
Tim Petersced69f82003-09-16 20:30:58 +00006448static
6449PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 Py_ssize_t left,
6451 Py_ssize_t right,
6452 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453{
6454 PyUnicodeObject *u;
6455
6456 if (left < 0)
6457 left = 0;
6458 if (right < 0)
6459 right = 0;
6460
Tim Peters7a29bd52001-09-12 03:03:31 +00006461 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 Py_INCREF(self);
6463 return self;
6464 }
6465
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006466 if (left > PY_SSIZE_T_MAX - self->length ||
6467 right > PY_SSIZE_T_MAX - (left + self->length)) {
6468 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6469 return NULL;
6470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 u = _PyUnicode_New(left + self->length + right);
6472 if (u) {
6473 if (left)
6474 Py_UNICODE_FILL(u->str, fill, left);
6475 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6476 if (right)
6477 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6478 }
6479
6480 return u;
6481}
6482
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006483PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486
6487 string = PyUnicode_FromObject(string);
6488 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006491 list = stringlib_splitlines(
6492 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6493 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494
6495 Py_DECREF(string);
6496 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497}
6498
Tim Petersced69f82003-09-16 20:30:58 +00006499static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 PyUnicodeObject *substring,
6502 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006505 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006508 return stringlib_split_whitespace(
6509 (PyObject*) self, self->str, self->length, maxcount
6510 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006512 return stringlib_split(
6513 (PyObject*) self, self->str, self->length,
6514 substring->str, substring->length,
6515 maxcount
6516 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517}
6518
Tim Petersced69f82003-09-16 20:30:58 +00006519static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006520PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 PyUnicodeObject *substring,
6522 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006523{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006524 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006525 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006526
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006527 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006528 return stringlib_rsplit_whitespace(
6529 (PyObject*) self, self->str, self->length, maxcount
6530 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006531
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006532 return stringlib_rsplit(
6533 (PyObject*) self, self->str, self->length,
6534 substring->str, substring->length,
6535 maxcount
6536 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006537}
6538
6539static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 PyUnicodeObject *str1,
6542 PyUnicodeObject *str2,
6543 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544{
6545 PyUnicodeObject *u;
6546
6547 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006549 else if (maxcount == 0 || self->length == 0)
6550 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551
Thomas Wouters477c8d52006-05-27 19:21:47 +00006552 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006553 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006554 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006555 if (str1->length == 0)
6556 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006557 if (str1->length == 1) {
6558 /* replace characters */
6559 Py_UNICODE u1, u2;
6560 if (!findchar(self->str, self->length, str1->str[0]))
6561 goto nothing;
6562 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6563 if (!u)
6564 return NULL;
6565 Py_UNICODE_COPY(u->str, self->str, self->length);
6566 u1 = str1->str[0];
6567 u2 = str2->str[0];
6568 for (i = 0; i < u->length; i++)
6569 if (u->str[i] == u1) {
6570 if (--maxcount < 0)
6571 break;
6572 u->str[i] = u2;
6573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006575 i = stringlib_find(
6576 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006578 if (i < 0)
6579 goto nothing;
6580 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6581 if (!u)
6582 return NULL;
6583 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006584
6585 /* change everything in-place, starting with this one */
6586 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6587 i += str1->length;
6588
6589 while ( --maxcount > 0) {
6590 i = stringlib_find(self->str+i, self->length-i,
6591 str1->str, str1->length,
6592 i);
6593 if (i == -1)
6594 break;
6595 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6596 i += str1->length;
6597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006600
6601 Py_ssize_t n, i, j, e;
6602 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 Py_UNICODE *p;
6604
6605 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006606 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6607 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006608 if (n == 0)
6609 goto nothing;
6610 /* new_size = self->length + n * (str2->length - str1->length)); */
6611 delta = (str2->length - str1->length);
6612 if (delta == 0) {
6613 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006615 product = n * (str2->length - str1->length);
6616 if ((product / (str2->length - str1->length)) != n) {
6617 PyErr_SetString(PyExc_OverflowError,
6618 "replace string is too long");
6619 return NULL;
6620 }
6621 new_size = self->length + product;
6622 if (new_size < 0) {
6623 PyErr_SetString(PyExc_OverflowError,
6624 "replace string is too long");
6625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 }
6627 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006628 u = _PyUnicode_New(new_size);
6629 if (!u)
6630 return NULL;
6631 i = 0;
6632 p = u->str;
6633 e = self->length - str1->length;
6634 if (str1->length > 0) {
6635 while (n-- > 0) {
6636 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006637 j = stringlib_find(self->str+i, self->length-i,
6638 str1->str, str1->length,
6639 i);
6640 if (j == -1)
6641 break;
6642 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006643 /* copy unchanged part [i:j] */
6644 Py_UNICODE_COPY(p, self->str+i, j-i);
6645 p += j - i;
6646 }
6647 /* copy substitution string */
6648 if (str2->length > 0) {
6649 Py_UNICODE_COPY(p, str2->str, str2->length);
6650 p += str2->length;
6651 }
6652 i = j + str1->length;
6653 }
6654 if (i < self->length)
6655 /* copy tail [i:] */
6656 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6657 } else {
6658 /* interleave */
6659 while (n > 0) {
6660 Py_UNICODE_COPY(p, str2->str, str2->length);
6661 p += str2->length;
6662 if (--n <= 0)
6663 break;
6664 *p++ = self->str[i++];
6665 }
6666 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006670
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006672 /* nothing to replace; return original string (when possible) */
6673 if (PyUnicode_CheckExact(self)) {
6674 Py_INCREF(self);
6675 return (PyObject *) self;
6676 }
6677 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678}
6679
6680/* --- Unicode Object Methods --------------------------------------------- */
6681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006682PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684\n\
6685Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006686characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
6688static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006689unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 return fixup(self, fixtitle);
6692}
6693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006694PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696\n\
6697Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006698have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699
6700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006701unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 return fixup(self, fixcapitalize);
6704}
6705
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords().
   Splits self into words, replaces every list entry by its capitalized
   form and joins the list again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Break the string up into a list of words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized counterpart */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *previous = PyList_GET_ITEM(words, pos);

        result = fixup((PyUnicodeObject *)previous, fixcapitalize);
        if (result == NULL)
            goto onError;
        /* the list slot takes over the reference to the new word */
        Py_DECREF(previous);
        PyList_SET_ITEM(words, pos, result);
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* result is NULL here when capitalizing a word failed */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6743
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006744/* Argument converter. Coerces to a single unicode character */
6745
6746static int
6747convert_uc(PyObject *obj, void *addr)
6748{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006749 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6750 PyObject *uniobj;
6751 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006752
Benjamin Peterson14339b62009-01-31 16:36:08 +00006753 uniobj = PyUnicode_FromObject(obj);
6754 if (uniobj == NULL) {
6755 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006757 return 0;
6758 }
6759 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6760 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006762 Py_DECREF(uniobj);
6763 return 0;
6764 }
6765 unistr = PyUnicode_AS_UNICODE(uniobj);
6766 *fillcharloc = unistr[0];
6767 Py_DECREF(uniobj);
6768 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006769}
6770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006771PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006774Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006775done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776
6777static PyObject *
6778unicode_center(PyUnicodeObject *self, PyObject *args)
6779{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006780 Py_ssize_t marg, left;
6781 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006782 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783
Thomas Woutersde017742006-02-16 19:34:37 +00006784 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 return NULL;
6786
Tim Peters7a29bd52001-09-12 03:03:31 +00006787 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 Py_INCREF(self);
6789 return (PyObject*) self;
6790 }
6791
6792 marg = width - self->length;
6793 left = marg / 2 + (marg & width & 1);
6794
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006795 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796}
6797
Marc-André Lemburge5034372000-08-08 08:04:29 +00006798#if 0
6799
6800/* This code should go into some future Unicode collation support
6801 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006802 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006803
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006804/* speedy UTF-16 code point order comparison */
6805/* gleaned from: */
6806/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6807
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006808static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006809{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006810 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006811 0, 0, 0, 0, 0, 0, 0, 0,
6812 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006813 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006814};
6815
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816static int
6817unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6818{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006819 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006820
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 Py_UNICODE *s1 = str1->str;
6822 Py_UNICODE *s2 = str2->str;
6823
6824 len1 = str1->length;
6825 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006826
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006828 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006829
6830 c1 = *s1++;
6831 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006832
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 if (c1 > (1<<11) * 26)
6834 c1 += utf16Fixup[c1>>11];
6835 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006836 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006837 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006838
6839 if (c1 != c2)
6840 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006841
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006842 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 }
6844
6845 return (len1 < len2) ? -1 : (len1 != len2);
6846}
6847
Marc-André Lemburge5034372000-08-08 08:04:29 +00006848#else
6849
6850static int
6851unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006853 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006854
6855 Py_UNICODE *s1 = str1->str;
6856 Py_UNICODE *s2 = str2->str;
6857
6858 len1 = str1->length;
6859 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006860
Marc-André Lemburge5034372000-08-08 08:04:29 +00006861 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006862 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006863
Fredrik Lundh45714e92001-06-26 16:39:36 +00006864 c1 = *s1++;
6865 c2 = *s2++;
6866
6867 if (c1 != c2)
6868 return (c1 < c2) ? -1 : 1;
6869
Marc-André Lemburge5034372000-08-08 08:04:29 +00006870 len1--; len2--;
6871 }
6872
6873 return (len1 < len2) ? -1 : (len1 != len2);
6874}
6875
6876#endif
6877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006881 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6882 return unicode_compare((PyUnicodeObject *)left,
6883 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006884 PyErr_Format(PyExc_TypeError,
6885 "Can't compare %.100s and %.100s",
6886 left->ob_type->tp_name,
6887 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 return -1;
6889}
6890
Martin v. Löwis5b222132007-06-10 09:51:05 +00006891int
6892PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6893{
6894 int i;
6895 Py_UNICODE *id;
6896 assert(PyUnicode_Check(uni));
6897 id = PyUnicode_AS_UNICODE(uni);
6898 /* Compare Unicode string and source character set string */
6899 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 if (id[i] != str[i])
6901 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006902 /* This check keeps Python strings that end in '\0' from comparing equal
6903 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006904 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006906 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006908 return 0;
6909}
6910
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006911
Benjamin Peterson29060642009-01-31 22:14:21 +00006912#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006913 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006914
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006915PyObject *PyUnicode_RichCompare(PyObject *left,
6916 PyObject *right,
6917 int op)
6918{
6919 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006920
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006921 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6922 PyObject *v;
6923 if (((PyUnicodeObject *) left)->length !=
6924 ((PyUnicodeObject *) right)->length) {
6925 if (op == Py_EQ) {
6926 Py_INCREF(Py_False);
6927 return Py_False;
6928 }
6929 if (op == Py_NE) {
6930 Py_INCREF(Py_True);
6931 return Py_True;
6932 }
6933 }
6934 if (left == right)
6935 result = 0;
6936 else
6937 result = unicode_compare((PyUnicodeObject *)left,
6938 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006939
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006940 /* Convert the return value to a Boolean */
6941 switch (op) {
6942 case Py_EQ:
6943 v = TEST_COND(result == 0);
6944 break;
6945 case Py_NE:
6946 v = TEST_COND(result != 0);
6947 break;
6948 case Py_LE:
6949 v = TEST_COND(result <= 0);
6950 break;
6951 case Py_GE:
6952 v = TEST_COND(result >= 0);
6953 break;
6954 case Py_LT:
6955 v = TEST_COND(result == -1);
6956 break;
6957 case Py_GT:
6958 v = TEST_COND(result == 1);
6959 break;
6960 default:
6961 PyErr_BadArgument();
6962 return NULL;
6963 }
6964 Py_INCREF(v);
6965 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006966 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006967
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006968 Py_INCREF(Py_NotImplemented);
6969 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006970}
6971
Guido van Rossum403d68b2000-03-13 15:55:09 +00006972int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006974{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006975 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006976 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006977
6978 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006979 sub = PyUnicode_FromObject(element);
6980 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 PyErr_Format(PyExc_TypeError,
6982 "'in <string>' requires string as left operand, not %s",
6983 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006984 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006985 }
6986
Thomas Wouters477c8d52006-05-27 19:21:47 +00006987 str = PyUnicode_FromObject(container);
6988 if (!str) {
6989 Py_DECREF(sub);
6990 return -1;
6991 }
6992
6993 result = stringlib_contains_obj(str, sub);
6994
6995 Py_DECREF(str);
6996 Py_DECREF(sub);
6997
Guido van Rossum403d68b2000-03-13 15:55:09 +00006998 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006999}
7000
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001/* Concat to string or Unicode object giving a new Unicode object. */
7002
7003PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005{
7006 PyUnicodeObject *u = NULL, *v = NULL, *w;
7007
7008 /* Coerce the two arguments */
7009 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7010 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7013 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015
7016 /* Shortcuts */
7017 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 Py_DECREF(v);
7019 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 }
7021 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 Py_DECREF(u);
7023 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 }
7025
7026 /* Concat the two Unicode strings */
7027 w = _PyUnicode_New(u->length + v->length);
7028 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 Py_UNICODE_COPY(w->str, u->str, u->length);
7031 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7032
7033 Py_DECREF(u);
7034 Py_DECREF(v);
7035 return (PyObject *)w;
7036
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 Py_XDECREF(u);
7039 Py_XDECREF(v);
7040 return NULL;
7041}
7042
Walter Dörwald1ab83302007-05-18 17:15:44 +00007043void
7044PyUnicode_Append(PyObject **pleft, PyObject *right)
7045{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007046 PyObject *new;
7047 if (*pleft == NULL)
7048 return;
7049 if (right == NULL || !PyUnicode_Check(*pleft)) {
7050 Py_DECREF(*pleft);
7051 *pleft = NULL;
7052 return;
7053 }
7054 new = PyUnicode_Concat(*pleft, right);
7055 Py_DECREF(*pleft);
7056 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007057}
7058
7059void
7060PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7061{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007062 PyUnicode_Append(pleft, right);
7063 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007064}
7065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007066PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007069Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007070string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007071interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072
7073static PyObject *
7074unicode_count(PyUnicodeObject *self, PyObject *args)
7075{
7076 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007077 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007078 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 PyObject *result;
7080
Guido van Rossumb8872e62000-05-09 14:14:27 +00007081 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083 return NULL;
7084
7085 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007086 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007089
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007090 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007091 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007092 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007093 substring->str, substring->length,
7094 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007095 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096
7097 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007098
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 return result;
7100}
7101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007102PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007105Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007106to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007107handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007108a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7109'xmlcharrefreplace' as well as any other name registered with\n\
7110codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111
7112static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007113unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007115 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116 char *encoding = NULL;
7117 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007118 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007119
Benjamin Peterson308d6372009-09-18 21:42:35 +00007120 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7121 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007123 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007124 if (v == NULL)
7125 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007126 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007127 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007128 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007129 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007130 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007131 Py_DECREF(v);
7132 return NULL;
7133 }
7134 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007135
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007137 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007138}
7139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007140PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142\n\
7143Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145
7146static PyObject*
7147unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7148{
7149 Py_UNICODE *e;
7150 Py_UNICODE *p;
7151 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007152 Py_UNICODE *qe;
7153 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 PyUnicodeObject *u;
7155 int tabsize = 8;
7156
7157 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159
Thomas Wouters7e474022000-07-16 12:04:32 +00007160 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007161 i = 0; /* chars up to and including most recent \n or \r */
7162 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7163 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 for (p = self->str; p < e; p++)
7165 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 if (tabsize > 0) {
7167 incr = tabsize - (j % tabsize); /* cannot overflow */
7168 if (j > PY_SSIZE_T_MAX - incr)
7169 goto overflow1;
7170 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007171 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 if (j > PY_SSIZE_T_MAX - 1)
7175 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 j++;
7177 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 if (i > PY_SSIZE_T_MAX - j)
7179 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007181 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 }
7183 }
7184
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007185 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007187
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 /* Second pass: create output string and fill it */
7189 u = _PyUnicode_New(i + j);
7190 if (!u)
7191 return NULL;
7192
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007193 j = 0; /* same as in first pass */
7194 q = u->str; /* next output char */
7195 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
7197 for (p = self->str; p < e; p++)
7198 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 if (tabsize > 0) {
7200 i = tabsize - (j % tabsize);
7201 j += i;
7202 while (i--) {
7203 if (q >= qe)
7204 goto overflow2;
7205 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007206 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007208 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 else {
7210 if (q >= qe)
7211 goto overflow2;
7212 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007213 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 if (*p == '\n' || *p == '\r')
7215 j = 0;
7216 }
7217
7218 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007219
7220 overflow2:
7221 Py_DECREF(u);
7222 overflow1:
7223 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7224 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225}
7226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007227PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007228 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229\n\
7230Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007231such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232arguments start and end are interpreted as in slice notation.\n\
7233\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007234Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235
7236static PyObject *
7237unicode_find(PyUnicodeObject *self, PyObject *args)
7238{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007239 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007240 Py_ssize_t start;
7241 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007242 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243
Christian Heimes9cd17752007-11-18 19:35:23 +00007244 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246
Thomas Wouters477c8d52006-05-27 19:21:47 +00007247 result = stringlib_find_slice(
7248 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7249 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7250 start, end
7251 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252
7253 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007254
Christian Heimes217cfd12007-12-02 14:31:20 +00007255 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256}
7257
7258static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007259unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260{
7261 if (index < 0 || index >= self->length) {
7262 PyErr_SetString(PyExc_IndexError, "string index out of range");
7263 return NULL;
7264 }
7265
7266 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7267}
7268
Guido van Rossumc2504932007-09-18 19:42:40 +00007269/* Believe it or not, this produces the same value for ASCII strings
7270 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007272unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273{
Guido van Rossumc2504932007-09-18 19:42:40 +00007274 Py_ssize_t len;
7275 Py_UNICODE *p;
7276 long x;
7277
7278 if (self->hash != -1)
7279 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007280 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007281 p = self->str;
7282 x = *p << 7;
7283 while (--len >= 0)
7284 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007285 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007286 if (x == -1)
7287 x = -2;
7288 self->hash = x;
7289 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290}
7291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007292PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007295Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296
7297static PyObject *
7298unicode_index(PyUnicodeObject *self, PyObject *args)
7299{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007300 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007301 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007302 Py_ssize_t start;
7303 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304
Christian Heimes9cd17752007-11-18 19:35:23 +00007305 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307
Thomas Wouters477c8d52006-05-27 19:21:47 +00007308 result = stringlib_find_slice(
7309 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7310 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7311 start, end
7312 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313
7314 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007315
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316 if (result < 0) {
7317 PyErr_SetString(PyExc_ValueError, "substring not found");
7318 return NULL;
7319 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007320
Christian Heimes217cfd12007-12-02 14:31:20 +00007321 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322}
7323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007324PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007327Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007328at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329
7330static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007331unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332{
7333 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7334 register const Py_UNICODE *e;
7335 int cased;
7336
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 /* Shortcut for single character strings */
7338 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007341 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007342 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007344
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 e = p + PyUnicode_GET_SIZE(self);
7346 cased = 0;
7347 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007349
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7351 return PyBool_FromLong(0);
7352 else if (!cased && Py_UNICODE_ISLOWER(ch))
7353 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007355 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356}
7357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007358PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007361Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007362at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363
7364static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007365unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366{
7367 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7368 register const Py_UNICODE *e;
7369 int cased;
7370
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371 /* Shortcut for single character strings */
7372 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007375 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007376 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007378
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 e = p + PyUnicode_GET_SIZE(self);
7380 cased = 0;
7381 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007383
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7385 return PyBool_FromLong(0);
7386 else if (!cased && Py_UNICODE_ISUPPER(ch))
7387 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007389 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390}
7391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007392PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007395Return True if S is a titlecased string and there is at least one\n\
7396character in S, i.e. upper- and titlecase characters may only\n\
7397follow uncased characters and lowercase characters only cased ones.\n\
7398Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399
7400static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007401unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402{
7403 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7404 register const Py_UNICODE *e;
7405 int cased, previous_is_cased;
7406
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407 /* Shortcut for single character strings */
7408 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7410 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007412 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007413 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007415
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416 e = p + PyUnicode_GET_SIZE(self);
7417 cased = 0;
7418 previous_is_cased = 0;
7419 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007421
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7423 if (previous_is_cased)
7424 return PyBool_FromLong(0);
7425 previous_is_cased = 1;
7426 cased = 1;
7427 }
7428 else if (Py_UNICODE_ISLOWER(ch)) {
7429 if (!previous_is_cased)
7430 return PyBool_FromLong(0);
7431 previous_is_cased = 1;
7432 cased = 1;
7433 }
7434 else
7435 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007437 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438}
7439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007440PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007443Return True if all characters in S are whitespace\n\
7444and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445
7446static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007447unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448{
7449 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7450 register const Py_UNICODE *e;
7451
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452 /* Shortcut for single character strings */
7453 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 Py_UNICODE_ISSPACE(*p))
7455 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007457 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007458 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007460
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 e = p + PyUnicode_GET_SIZE(self);
7462 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 if (!Py_UNICODE_ISSPACE(*p))
7464 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007466 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467}
7468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007469PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007471\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007472Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007473and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007474
7475static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007476unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007477{
7478 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7479 register const Py_UNICODE *e;
7480
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007481 /* Shortcut for single character strings */
7482 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 Py_UNICODE_ISALPHA(*p))
7484 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007485
7486 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007487 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007489
7490 e = p + PyUnicode_GET_SIZE(self);
7491 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 if (!Py_UNICODE_ISALPHA(*p))
7493 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007494 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007495 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007496}
7497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007498PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007500\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007501Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007502and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007503
7504static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007505unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007506{
7507 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7508 register const Py_UNICODE *e;
7509
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007510 /* Shortcut for single character strings */
7511 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 Py_UNICODE_ISALNUM(*p))
7513 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007514
7515 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007516 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007518
7519 e = p + PyUnicode_GET_SIZE(self);
7520 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 if (!Py_UNICODE_ISALNUM(*p))
7522 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007523 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007524 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007525}
7526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007527PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007530Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007531False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532
7533static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007534unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535{
7536 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7537 register const Py_UNICODE *e;
7538
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 /* Shortcut for single character strings */
7540 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 Py_UNICODE_ISDECIMAL(*p))
7542 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007544 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007545 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007546 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007547
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 e = p + PyUnicode_GET_SIZE(self);
7549 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 if (!Py_UNICODE_ISDECIMAL(*p))
7551 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007553 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554}
7555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007556PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007557 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007559Return True if all characters in S are digits\n\
7560and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
7562static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007563unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564{
7565 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7566 register const Py_UNICODE *e;
7567
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 /* Shortcut for single character strings */
7569 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 Py_UNICODE_ISDIGIT(*p))
7571 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007573 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007574 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007576
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 e = p + PyUnicode_GET_SIZE(self);
7578 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 if (!Py_UNICODE_ISDIGIT(*p))
7580 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007582 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583}
7584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007585PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007588Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007589False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590
7591static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007592unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593{
7594 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7595 register const Py_UNICODE *e;
7596
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 /* Shortcut for single character strings */
7598 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 Py_UNICODE_ISNUMERIC(*p))
7600 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007602 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007603 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007605
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 e = p + PyUnicode_GET_SIZE(self);
7607 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 if (!Py_UNICODE_ISNUMERIC(*p))
7609 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007611 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612}
7613
Martin v. Löwis47383402007-08-15 07:32:56 +00007614int
7615PyUnicode_IsIdentifier(PyObject *self)
7616{
7617 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7618 register const Py_UNICODE *e;
7619
7620 /* Special case for empty strings */
7621 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007623
7624 /* PEP 3131 says that the first character must be in
7625 XID_Start and subsequent characters in XID_Continue,
7626 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007627 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007628 letters, digits, underscore). However, given the current
7629 definition of XID_Start and XID_Continue, it is sufficient
7630 to check just for these, except that _ must be allowed
7631 as starting an identifier. */
7632 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7633 return 0;
7634
7635 e = p + PyUnicode_GET_SIZE(self);
7636 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 if (!_PyUnicode_IsXidContinue(*p))
7638 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007639 }
7640 return 1;
7641}
7642
7643PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007645\n\
7646Return True if S is a valid identifier according\n\
7647to the language definition.");
7648
7649static PyObject*
7650unicode_isidentifier(PyObject *self)
7651{
7652 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7653}
7654
Georg Brandl559e5d72008-06-11 18:37:52 +00007655PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007657\n\
7658Return True if all characters in S are considered\n\
7659printable in repr() or S is empty, False otherwise.");
7660
7661static PyObject*
7662unicode_isprintable(PyObject *self)
7663{
7664 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7665 register const Py_UNICODE *e;
7666
7667 /* Shortcut for single character strings */
7668 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7669 Py_RETURN_TRUE;
7670 }
7671
7672 e = p + PyUnicode_GET_SIZE(self);
7673 for (; p < e; p++) {
7674 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7675 Py_RETURN_FALSE;
7676 }
7677 }
7678 Py_RETURN_TRUE;
7679}
7680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007681PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007682 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683\n\
7684Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007685iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686
7687static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007688unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007690 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691}
7692
Martin v. Löwis18e16552006-02-15 17:27:45 +00007693static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694unicode_length(PyUnicodeObject *self)
7695{
7696 return self->length;
7697}
7698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007699PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007700 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007702Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007703done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704
7705static PyObject *
7706unicode_ljust(PyUnicodeObject *self, PyObject *args)
7707{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007708 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007709 Py_UNICODE fillchar = ' ';
7710
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007711 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712 return NULL;
7713
Tim Peters7a29bd52001-09-12 03:03:31 +00007714 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 Py_INCREF(self);
7716 return (PyObject*) self;
7717 }
7718
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007719 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720}
7721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007722PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007725Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
7727static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007728unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 return fixup(self, fixlower);
7731}
7732
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the constants above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip type i: the format string minus its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
7741
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007742/* externally visible for str.strip(unicode) */
7743PyObject *
7744_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7745{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007746 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7747 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7748 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7749 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7750 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007751
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007753
Benjamin Peterson14339b62009-01-31 16:36:08 +00007754 i = 0;
7755 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7757 i++;
7758 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007759 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007760
Benjamin Peterson14339b62009-01-31 16:36:08 +00007761 j = len;
7762 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 do {
7764 j--;
7765 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7766 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007767 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007768
Benjamin Peterson14339b62009-01-31 16:36:08 +00007769 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 Py_INCREF(self);
7771 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 }
7773 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007775}
7776
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777
7778static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007779do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007781 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7782 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007783
Benjamin Peterson14339b62009-01-31 16:36:08 +00007784 i = 0;
7785 if (striptype != RIGHTSTRIP) {
7786 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7787 i++;
7788 }
7789 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007790
Benjamin Peterson14339b62009-01-31 16:36:08 +00007791 j = len;
7792 if (striptype != LEFTSTRIP) {
7793 do {
7794 j--;
7795 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7796 j++;
7797 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007798
Benjamin Peterson14339b62009-01-31 16:36:08 +00007799 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7800 Py_INCREF(self);
7801 return (PyObject*)self;
7802 }
7803 else
7804 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805}
7806
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007807
7808static PyObject *
7809do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7810{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007811 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007812
Benjamin Peterson14339b62009-01-31 16:36:08 +00007813 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7814 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007815
Benjamin Peterson14339b62009-01-31 16:36:08 +00007816 if (sep != NULL && sep != Py_None) {
7817 if (PyUnicode_Check(sep))
7818 return _PyUnicode_XStrip(self, striptype, sep);
7819 else {
7820 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 "%s arg must be None or str",
7822 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 return NULL;
7824 }
7825 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007826
Benjamin Peterson14339b62009-01-31 16:36:08 +00007827 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007828}
7829
7830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007831PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007833\n\
7834Return a copy of the string S with leading and trailing\n\
7835whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007836If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007837
7838static PyObject *
7839unicode_strip(PyUnicodeObject *self, PyObject *args)
7840{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007841 if (PyTuple_GET_SIZE(args) == 0)
7842 return do_strip(self, BOTHSTRIP); /* Common case */
7843 else
7844 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007845}
7846
7847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007848PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007850\n\
7851Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007852If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007853
7854static PyObject *
7855unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7856{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007857 if (PyTuple_GET_SIZE(args) == 0)
7858 return do_strip(self, LEFTSTRIP); /* Common case */
7859 else
7860 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007861}
7862
7863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007864PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007866\n\
7867Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007868If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007869
7870static PyObject *
7871unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7872{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007873 if (PyTuple_GET_SIZE(args) == 0)
7874 return do_strip(self, RIGHTSTRIP); /* Common case */
7875 else
7876 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007877}
7878
7879
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007881unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882{
7883 PyUnicodeObject *u;
7884 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007885 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007886 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887
Georg Brandl222de0f2009-04-12 12:01:50 +00007888 if (len < 1) {
7889 Py_INCREF(unicode_empty);
7890 return (PyObject *)unicode_empty;
7891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892
Tim Peters7a29bd52001-09-12 03:03:31 +00007893 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894 /* no repeat, return original string */
7895 Py_INCREF(str);
7896 return (PyObject*) str;
7897 }
Tim Peters8f422462000-09-09 06:13:41 +00007898
7899 /* ensure # of chars needed doesn't overflow int and # of bytes
7900 * needed doesn't overflow size_t
7901 */
7902 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007903 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007904 PyErr_SetString(PyExc_OverflowError,
7905 "repeated string is too long");
7906 return NULL;
7907 }
7908 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7909 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7910 PyErr_SetString(PyExc_OverflowError,
7911 "repeated string is too long");
7912 return NULL;
7913 }
7914 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915 if (!u)
7916 return NULL;
7917
7918 p = u->str;
7919
Georg Brandl222de0f2009-04-12 12:01:50 +00007920 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007921 Py_UNICODE_FILL(p, str->str[0], len);
7922 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007923 Py_ssize_t done = str->length; /* number of characters copied this far */
7924 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007926 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007927 Py_UNICODE_COPY(p+done, p, n);
7928 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 }
7931
7932 return (PyObject*) u;
7933}
7934
7935PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 PyObject *subobj,
7937 PyObject *replobj,
7938 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939{
7940 PyObject *self;
7941 PyObject *str1;
7942 PyObject *str2;
7943 PyObject *result;
7944
7945 self = PyUnicode_FromObject(obj);
7946 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948 str1 = PyUnicode_FromObject(subobj);
7949 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 Py_DECREF(self);
7951 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 }
7953 str2 = PyUnicode_FromObject(replobj);
7954 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 Py_DECREF(self);
7956 Py_DECREF(str1);
7957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 }
Tim Petersced69f82003-09-16 20:30:58 +00007959 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 (PyUnicodeObject *)str1,
7961 (PyUnicodeObject *)str2,
7962 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 Py_DECREF(self);
7964 Py_DECREF(str1);
7965 Py_DECREF(str2);
7966 return result;
7967}
7968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007969PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00007970 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971\n\
7972Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007973old replaced by new. If the optional argument count is\n\
7974given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975
7976static PyObject*
7977unicode_replace(PyUnicodeObject *self, PyObject *args)
7978{
7979 PyUnicodeObject *str1;
7980 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007981 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 PyObject *result;
7983
Martin v. Löwis18e16552006-02-15 17:27:45 +00007984 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 return NULL;
7986 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7987 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007990 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 Py_DECREF(str1);
7992 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994
7995 result = replace(self, str1, str2, maxcount);
7996
7997 Py_DECREF(str1);
7998 Py_DECREF(str2);
7999 return result;
8000}
8001
8002static
8003PyObject *unicode_repr(PyObject *unicode)
8004{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008005 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008006 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008007 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8008 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8009
8010 /* XXX(nnorwitz): rather than over-allocating, it would be
8011 better to choose a different scheme. Perhaps scan the
8012 first N-chars of the string and allocate based on that size.
8013 */
8014 /* Initial allocation is based on the longest-possible unichr
8015 escape.
8016
8017 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8018 unichr, so in this case it's the longest unichr escape. In
8019 narrow (UTF-16) builds this is five chars per source unichr
8020 since there are two unichrs in the surrogate pair, so in narrow
8021 (UTF-16) builds it's not the longest unichr escape.
8022
8023 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8024 so in the narrow (UTF-16) build case it's the longest unichr
8025 escape.
8026 */
8027
Walter Dörwald1ab83302007-05-18 17:15:44 +00008028 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008030#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008032#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008034#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008036 if (repr == NULL)
8037 return NULL;
8038
Walter Dörwald1ab83302007-05-18 17:15:44 +00008039 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008040
8041 /* Add quote */
8042 *p++ = (findchar(s, size, '\'') &&
8043 !findchar(s, size, '"')) ? '"' : '\'';
8044 while (size-- > 0) {
8045 Py_UNICODE ch = *s++;
8046
8047 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008048 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008049 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008050 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008051 continue;
8052 }
8053
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008055 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008056 *p++ = '\\';
8057 *p++ = 't';
8058 }
8059 else if (ch == '\n') {
8060 *p++ = '\\';
8061 *p++ = 'n';
8062 }
8063 else if (ch == '\r') {
8064 *p++ = '\\';
8065 *p++ = 'r';
8066 }
8067
8068 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008069 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008070 *p++ = '\\';
8071 *p++ = 'x';
8072 *p++ = hexdigits[(ch >> 4) & 0x000F];
8073 *p++ = hexdigits[ch & 0x000F];
8074 }
8075
Georg Brandl559e5d72008-06-11 18:37:52 +00008076 /* Copy ASCII characters as-is */
8077 else if (ch < 0x7F) {
8078 *p++ = ch;
8079 }
8080
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008082 else {
8083 Py_UCS4 ucs = ch;
8084
8085#ifndef Py_UNICODE_WIDE
8086 Py_UNICODE ch2 = 0;
8087 /* Get code point from surrogate pair */
8088 if (size > 0) {
8089 ch2 = *s;
8090 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008092 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008094 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008095 size--;
8096 }
8097 }
8098#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008099 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008100 (categories Z* and C* except ASCII space)
8101 */
8102 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8103 /* Map 8-bit characters to '\xhh' */
8104 if (ucs <= 0xff) {
8105 *p++ = '\\';
8106 *p++ = 'x';
8107 *p++ = hexdigits[(ch >> 4) & 0x000F];
8108 *p++ = hexdigits[ch & 0x000F];
8109 }
8110 /* Map 21-bit characters to '\U00xxxxxx' */
8111 else if (ucs >= 0x10000) {
8112 *p++ = '\\';
8113 *p++ = 'U';
8114 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8115 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8116 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8117 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8118 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8119 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8120 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8121 *p++ = hexdigits[ucs & 0x0000000F];
8122 }
8123 /* Map 16-bit characters to '\uxxxx' */
8124 else {
8125 *p++ = '\\';
8126 *p++ = 'u';
8127 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8128 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8129 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8130 *p++ = hexdigits[ucs & 0x000F];
8131 }
8132 }
8133 /* Copy characters as-is */
8134 else {
8135 *p++ = ch;
8136#ifndef Py_UNICODE_WIDE
8137 if (ucs >= 0x10000)
8138 *p++ = ch2;
8139#endif
8140 }
8141 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008142 }
8143 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008144 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008145
8146 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008147 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008148 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149}
8150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008151PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153\n\
8154Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008155such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156arguments start and end are interpreted as in slice notation.\n\
8157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008158Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159
8160static PyObject *
8161unicode_rfind(PyUnicodeObject *self, PyObject *args)
8162{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008163 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008164 Py_ssize_t start;
8165 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008166 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167
Christian Heimes9cd17752007-11-18 19:35:23 +00008168 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170
Thomas Wouters477c8d52006-05-27 19:21:47 +00008171 result = stringlib_rfind_slice(
8172 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8173 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8174 start, end
8175 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176
8177 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008178
Christian Heimes217cfd12007-12-02 14:31:20 +00008179 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180}
8181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008182PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008185Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186
8187static PyObject *
8188unicode_rindex(PyUnicodeObject *self, PyObject *args)
8189{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008190 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008191 Py_ssize_t start;
8192 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008193 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194
Christian Heimes9cd17752007-11-18 19:35:23 +00008195 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197
Thomas Wouters477c8d52006-05-27 19:21:47 +00008198 result = stringlib_rfind_slice(
8199 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8200 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8201 start, end
8202 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203
8204 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008205
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206 if (result < 0) {
8207 PyErr_SetString(PyExc_ValueError, "substring not found");
8208 return NULL;
8209 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008210 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211}
8212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008213PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008216Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008217done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218
8219static PyObject *
8220unicode_rjust(PyUnicodeObject *self, PyObject *args)
8221{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008222 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008223 Py_UNICODE fillchar = ' ';
8224
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008225 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 return NULL;
8227
Tim Peters7a29bd52001-09-12 03:03:31 +00008228 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229 Py_INCREF(self);
8230 return (PyObject*) self;
8231 }
8232
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008233 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234}
8235
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 PyObject *sep,
8238 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239{
8240 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008241
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 s = PyUnicode_FromObject(s);
8243 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008244 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 if (sep != NULL) {
8246 sep = PyUnicode_FromObject(sep);
8247 if (sep == NULL) {
8248 Py_DECREF(s);
8249 return NULL;
8250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251 }
8252
8253 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8254
8255 Py_DECREF(s);
8256 Py_XDECREF(sep);
8257 return result;
8258}
8259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008260PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262\n\
8263Return a list of the words in S, using sep as the\n\
8264delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008265splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008266whitespace string is a separator and empty strings are\n\
8267removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268
8269static PyObject*
8270unicode_split(PyUnicodeObject *self, PyObject *args)
8271{
8272 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008273 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274
Martin v. Löwis18e16552006-02-15 17:27:45 +00008275 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 return NULL;
8277
8278 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284}
8285
Thomas Wouters477c8d52006-05-27 19:21:47 +00008286PyObject *
8287PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8288{
8289 PyObject* str_obj;
8290 PyObject* sep_obj;
8291 PyObject* out;
8292
8293 str_obj = PyUnicode_FromObject(str_in);
8294 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008296 sep_obj = PyUnicode_FromObject(sep_in);
8297 if (!sep_obj) {
8298 Py_DECREF(str_obj);
8299 return NULL;
8300 }
8301
8302 out = stringlib_partition(
8303 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8304 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8305 );
8306
8307 Py_DECREF(sep_obj);
8308 Py_DECREF(str_obj);
8309
8310 return out;
8311}
8312
8313
8314PyObject *
8315PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8316{
8317 PyObject* str_obj;
8318 PyObject* sep_obj;
8319 PyObject* out;
8320
8321 str_obj = PyUnicode_FromObject(str_in);
8322 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008324 sep_obj = PyUnicode_FromObject(sep_in);
8325 if (!sep_obj) {
8326 Py_DECREF(str_obj);
8327 return NULL;
8328 }
8329
8330 out = stringlib_rpartition(
8331 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8332 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8333 );
8334
8335 Py_DECREF(sep_obj);
8336 Py_DECREF(str_obj);
8337
8338 return out;
8339}
8340
8341PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008343\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008344Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008345the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008346found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008347
8348static PyObject*
8349unicode_partition(PyUnicodeObject *self, PyObject *separator)
8350{
8351 return PyUnicode_Partition((PyObject *)self, separator);
8352}
8353
8354PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008355 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008356\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008357Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008358the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008359separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008360
8361static PyObject*
8362unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8363{
8364 return PyUnicode_RPartition((PyObject *)self, separator);
8365}
8366
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008367PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 PyObject *sep,
8369 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008370{
8371 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008372
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008373 s = PyUnicode_FromObject(s);
8374 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008375 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 if (sep != NULL) {
8377 sep = PyUnicode_FromObject(sep);
8378 if (sep == NULL) {
8379 Py_DECREF(s);
8380 return NULL;
8381 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008382 }
8383
8384 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8385
8386 Py_DECREF(s);
8387 Py_XDECREF(sep);
8388 return result;
8389}
8390
8391PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008393\n\
8394Return a list of the words in S, using sep as the\n\
8395delimiter string, starting at the end of the string and\n\
8396working to the front. If maxsplit is given, at most maxsplit\n\
8397splits are done. If sep is not specified, any whitespace string\n\
8398is a separator.");
8399
8400static PyObject*
8401unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8402{
8403 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008404 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008405
Martin v. Löwis18e16552006-02-15 17:27:45 +00008406 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008407 return NULL;
8408
8409 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008411 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008413 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008415}
8416
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008417PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419\n\
8420Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008421Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008422is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423
8424static PyObject*
8425unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8426{
Guido van Rossum86662912000-04-11 15:38:46 +00008427 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428
Guido van Rossum86662912000-04-11 15:38:46 +00008429 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 return NULL;
8431
Guido van Rossum86662912000-04-11 15:38:46 +00008432 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433}
8434
8435static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008436PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437{
Walter Dörwald346737f2007-05-31 10:44:43 +00008438 if (PyUnicode_CheckExact(self)) {
8439 Py_INCREF(self);
8440 return self;
8441 } else
8442 /* Subtype -- return genuine unicode string with the same value. */
8443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8444 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445}
8446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008447PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449\n\
8450Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008451and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452
8453static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008454unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 return fixup(self, fixswapcase);
8457}
8458
Georg Brandlceee0772007-11-27 23:48:05 +00008459PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008461\n\
8462Return a translation table usable for str.translate().\n\
8463If there is only one argument, it must be a dictionary mapping Unicode\n\
8464ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008465Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008466If there are two arguments, they must be strings of equal length, and\n\
8467in the resulting dictionary, each character in x will be mapped to the\n\
8468character at the same position in y. If there is a third argument, it\n\
8469must be a string, whose characters will be mapped to None in the result.");
8470
8471static PyObject*
8472unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8473{
8474 PyObject *x, *y = NULL, *z = NULL;
8475 PyObject *new = NULL, *key, *value;
8476 Py_ssize_t i = 0;
8477 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478
Georg Brandlceee0772007-11-27 23:48:05 +00008479 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8480 return NULL;
8481 new = PyDict_New();
8482 if (!new)
8483 return NULL;
8484 if (y != NULL) {
8485 /* x must be a string too, of equal length */
8486 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8487 if (!PyUnicode_Check(x)) {
8488 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8489 "be a string if there is a second argument");
8490 goto err;
8491 }
8492 if (PyUnicode_GET_SIZE(x) != ylen) {
8493 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8494 "arguments must have equal length");
8495 goto err;
8496 }
8497 /* create entries for translating chars in x to those in y */
8498 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008499 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8500 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008501 if (!key || !value)
8502 goto err;
8503 res = PyDict_SetItem(new, key, value);
8504 Py_DECREF(key);
8505 Py_DECREF(value);
8506 if (res < 0)
8507 goto err;
8508 }
8509 /* create entries for deleting chars in z */
8510 if (z != NULL) {
8511 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008512 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008513 if (!key)
8514 goto err;
8515 res = PyDict_SetItem(new, key, Py_None);
8516 Py_DECREF(key);
8517 if (res < 0)
8518 goto err;
8519 }
8520 }
8521 } else {
8522 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008523 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008524 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8525 "to maketrans it must be a dict");
8526 goto err;
8527 }
8528 /* copy entries into the new dict, converting string keys to int keys */
8529 while (PyDict_Next(x, &i, &key, &value)) {
8530 if (PyUnicode_Check(key)) {
8531 /* convert string keys to integer keys */
8532 PyObject *newkey;
8533 if (PyUnicode_GET_SIZE(key) != 1) {
8534 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8535 "table must be of length 1");
8536 goto err;
8537 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008538 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008539 if (!newkey)
8540 goto err;
8541 res = PyDict_SetItem(new, newkey, value);
8542 Py_DECREF(newkey);
8543 if (res < 0)
8544 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008545 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008546 /* just keep integer keys */
8547 if (PyDict_SetItem(new, key, value) < 0)
8548 goto err;
8549 } else {
8550 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8551 "be strings or integers");
8552 goto err;
8553 }
8554 }
8555 }
8556 return new;
8557 err:
8558 Py_DECREF(new);
8559 return NULL;
8560}
8561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008562PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564\n\
8565Return a copy of the string S, where all characters have been mapped\n\
8566through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008567Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008568Unmapped characters are left untouched. Characters mapped to None\n\
8569are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570
8571static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008572unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573{
Georg Brandlceee0772007-11-27 23:48:05 +00008574 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575}
8576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008577PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008580Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581
8582static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008583unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 return fixup(self, fixupper);
8586}
8587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008588PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008591Pad a numeric string S with zeros on the left, to fill a field\n\
8592of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593
8594static PyObject *
8595unicode_zfill(PyUnicodeObject *self, PyObject *args)
8596{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008597 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598 PyUnicodeObject *u;
8599
Martin v. Löwis18e16552006-02-15 17:27:45 +00008600 Py_ssize_t width;
8601 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 return NULL;
8603
8604 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008605 if (PyUnicode_CheckExact(self)) {
8606 Py_INCREF(self);
8607 return (PyObject*) self;
8608 }
8609 else
8610 return PyUnicode_FromUnicode(
8611 PyUnicode_AS_UNICODE(self),
8612 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 }
8615
8616 fill = width - self->length;
8617
8618 u = pad(self, fill, 0, '0');
8619
Walter Dörwald068325e2002-04-15 13:36:47 +00008620 if (u == NULL)
8621 return NULL;
8622
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 if (u->str[fill] == '+' || u->str[fill] == '-') {
8624 /* move sign to beginning of string */
8625 u->str[0] = u->str[fill];
8626 u->str[fill] = '0';
8627 }
8628
8629 return (PyObject*) u;
8630}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631
#if 0
/* Disabled debugging helper: reports the file-level `numfree` counter
   as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(
        numfree);
}
#endif
8639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008640PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008643Return True if S starts with the specified prefix, False otherwise.\n\
8644With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008645With optional end, stop comparing S at that position.\n\
8646prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647
8648static PyObject *
8649unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008652 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008654 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008655 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008656 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008658 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8660 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008661 if (PyTuple_Check(subobj)) {
8662 Py_ssize_t i;
8663 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8664 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008666 if (substring == NULL)
8667 return NULL;
8668 result = tailmatch(self, substring, start, end, -1);
8669 Py_DECREF(substring);
8670 if (result) {
8671 Py_RETURN_TRUE;
8672 }
8673 }
8674 /* nothing matched */
8675 Py_RETURN_FALSE;
8676 }
8677 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008680 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008682 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683}
8684
8685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008686PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008689Return True if S ends with the specified suffix, False otherwise.\n\
8690With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008691With optional end, stop comparing S at that position.\n\
8692suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693
8694static PyObject *
8695unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008698 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008700 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008701 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008702 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008704 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8706 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008707 if (PyTuple_Check(subobj)) {
8708 Py_ssize_t i;
8709 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8710 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008712 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008714 result = tailmatch(self, substring, start, end, +1);
8715 Py_DECREF(substring);
8716 if (result) {
8717 Py_RETURN_TRUE;
8718 }
8719 }
8720 Py_RETURN_FALSE;
8721 }
8722 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008726 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008728 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729}
8730
Eric Smith8c663262007-08-25 02:26:07 +00008731#include "stringlib/string_format.h"
8732
8733PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008735\n\
8736");
8737
Eric Smith4a7d76d2008-05-30 18:10:19 +00008738static PyObject *
8739unicode__format__(PyObject* self, PyObject* args)
8740{
8741 PyObject *format_spec;
8742
8743 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8744 return NULL;
8745
8746 return _PyUnicode_FormatAdvanced(self,
8747 PyUnicode_AS_UNICODE(format_spec),
8748 PyUnicode_GET_SIZE(format_spec));
8749}
8750
Eric Smith8c663262007-08-25 02:26:07 +00008751PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008753\n\
8754");
8755
8756static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008757unicode__sizeof__(PyUnicodeObject *v)
8758{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008759 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8760 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008761}
8762
8763PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008765
8766static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008767unicode_getnewargs(PyUnicodeObject *v)
8768{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008769 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008770}
8771
8772
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773static PyMethodDef unicode_methods[] = {
8774
8775 /* Order is according to common usage: often used methods should
8776 appear first, since lookup is done sequentially. */
8777
Benjamin Peterson308d6372009-09-18 21:42:35 +00008778 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008779 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8780 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008781 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008782 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8783 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8784 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8785 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8786 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8787 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8788 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008789 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008790 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8791 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8792 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008793 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008794 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8795 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8796 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008797 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008798 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008799 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008800 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008801 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8802 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8803 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8804 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8805 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8806 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8807 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8808 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8809 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8810 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8811 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8812 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8813 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8814 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008815 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008816 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008817 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008818 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008819 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008820 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8821 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008822 {"maketrans", (PyCFunction) unicode_maketrans,
8823 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008824 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008825#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008826 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827#endif
8828
8829#if 0
8830 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008831 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832#endif
8833
Benjamin Peterson14339b62009-01-31 16:36:08 +00008834 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835 {NULL, NULL}
8836};
8837
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008838static PyObject *
8839unicode_mod(PyObject *v, PyObject *w)
8840{
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 if (!PyUnicode_Check(v)) {
8842 Py_INCREF(Py_NotImplemented);
8843 return Py_NotImplemented;
8844 }
8845 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008846}
8847
8848static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008849 0, /*nb_add*/
8850 0, /*nb_subtract*/
8851 0, /*nb_multiply*/
8852 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008853};
8854
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008856 (lenfunc) unicode_length, /* sq_length */
8857 PyUnicode_Concat, /* sq_concat */
8858 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8859 (ssizeargfunc) unicode_getitem, /* sq_item */
8860 0, /* sq_slice */
8861 0, /* sq_ass_item */
8862 0, /* sq_ass_slice */
8863 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864};
8865
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008866static PyObject*
8867unicode_subscript(PyUnicodeObject* self, PyObject* item)
8868{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008869 if (PyIndex_Check(item)) {
8870 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008871 if (i == -1 && PyErr_Occurred())
8872 return NULL;
8873 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008874 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008875 return unicode_getitem(self, i);
8876 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008877 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008878 Py_UNICODE* source_buf;
8879 Py_UNICODE* result_buf;
8880 PyObject* result;
8881
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008882 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008884 return NULL;
8885 }
8886
8887 if (slicelength <= 0) {
8888 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008889 } else if (start == 0 && step == 1 && slicelength == self->length &&
8890 PyUnicode_CheckExact(self)) {
8891 Py_INCREF(self);
8892 return (PyObject *)self;
8893 } else if (step == 1) {
8894 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008895 } else {
8896 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008897 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8898 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008899
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 if (result_buf == NULL)
8901 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008902
8903 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8904 result_buf[i] = source_buf[cur];
8905 }
Tim Petersced69f82003-09-16 20:30:58 +00008906
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008907 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008908 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008909 return result;
8910 }
8911 } else {
8912 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8913 return NULL;
8914 }
8915}
8916
8917static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008918 (lenfunc)unicode_length, /* mp_length */
8919 (binaryfunc)unicode_subscript, /* mp_subscript */
8920 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008921};
8922
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924/* Helpers for PyUnicode_Format() */
8925
8926static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008927getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008929 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 (*p_argidx)++;
8932 if (arglen < 0)
8933 return args;
8934 else
8935 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 }
8937 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 return NULL;
8940}
8941
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008942/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008944static PyObject *
8945formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008947 char *p;
8948 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008950
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 x = PyFloat_AsDouble(v);
8952 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008953 return NULL;
8954
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008957
Eric Smith0923d1d2009-04-16 20:16:10 +00008958 p = PyOS_double_to_string(x, type, prec,
8959 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008960 if (p == NULL)
8961 return NULL;
8962 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008963 PyMem_Free(p);
8964 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965}
8966
Tim Peters38fd5b62000-09-21 05:43:11 +00008967static PyObject*
8968formatlong(PyObject *val, int flags, int prec, int type)
8969{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008970 char *buf;
8971 int len;
8972 PyObject *str; /* temporary string object. */
8973 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008974
Benjamin Peterson14339b62009-01-31 16:36:08 +00008975 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8976 if (!str)
8977 return NULL;
8978 result = PyUnicode_FromStringAndSize(buf, len);
8979 Py_DECREF(str);
8980 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008981}
8982
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983static int
8984formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008985 size_t buflen,
8986 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008988 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008989 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 if (PyUnicode_GET_SIZE(v) == 1) {
8991 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8992 buf[1] = '\0';
8993 return 1;
8994 }
8995#ifndef Py_UNICODE_WIDE
8996 if (PyUnicode_GET_SIZE(v) == 2) {
8997 /* Decode a valid surrogate pair */
8998 int c0 = PyUnicode_AS_UNICODE(v)[0];
8999 int c1 = PyUnicode_AS_UNICODE(v)[1];
9000 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9001 0xDC00 <= c1 && c1 <= 0xDFFF) {
9002 buf[0] = c0;
9003 buf[1] = c1;
9004 buf[2] = '\0';
9005 return 2;
9006 }
9007 }
9008#endif
9009 goto onError;
9010 }
9011 else {
9012 /* Integer input truncated to a character */
9013 long x;
9014 x = PyLong_AsLong(v);
9015 if (x == -1 && PyErr_Occurred())
9016 goto onError;
9017
9018 if (x < 0 || x > 0x10ffff) {
9019 PyErr_SetString(PyExc_OverflowError,
9020 "%c arg not in range(0x110000)");
9021 return -1;
9022 }
9023
9024#ifndef Py_UNICODE_WIDE
9025 if (x > 0xffff) {
9026 x -= 0x10000;
9027 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9028 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9029 return 2;
9030 }
9031#endif
9032 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009033 buf[1] = '\0';
9034 return 1;
9035 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009036
Benjamin Peterson29060642009-01-31 22:14:21 +00009037 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009038 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009039 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009040 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041}
9042
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...).

   FORMATBUFLEN is the length, in characters, of the scratch buffer in
   which individual characters are formatted.
*/
#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009047
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009049 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050{
9051 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009052 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053 int args_owned = 0;
9054 PyUnicodeObject *result = NULL;
9055 PyObject *dict = NULL;
9056 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009057
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009059 PyErr_BadInternalCall();
9060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061 }
9062 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009063 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065 fmt = PyUnicode_AS_UNICODE(uformat);
9066 fmtcnt = PyUnicode_GET_SIZE(uformat);
9067
9068 reslen = rescnt = fmtcnt + 100;
9069 result = _PyUnicode_New(reslen);
9070 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009071 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072 res = PyUnicode_AS_UNICODE(result);
9073
9074 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 arglen = PyTuple_Size(args);
9076 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077 }
9078 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009079 arglen = -1;
9080 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009082 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009083 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009084 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085
9086 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009087 if (*fmt != '%') {
9088 if (--rescnt < 0) {
9089 rescnt = fmtcnt + 100;
9090 reslen += rescnt;
9091 if (_PyUnicode_Resize(&result, reslen) < 0)
9092 goto onError;
9093 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9094 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009095 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009097 }
9098 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009099 /* Got a format specifier */
9100 int flags = 0;
9101 Py_ssize_t width = -1;
9102 int prec = -1;
9103 Py_UNICODE c = '\0';
9104 Py_UNICODE fill;
9105 int isnumok;
9106 PyObject *v = NULL;
9107 PyObject *temp = NULL;
9108 Py_UNICODE *pbuf;
9109 Py_UNICODE sign;
9110 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009111 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 fmt++;
9114 if (*fmt == '(') {
9115 Py_UNICODE *keystart;
9116 Py_ssize_t keylen;
9117 PyObject *key;
9118 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009119
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 if (dict == NULL) {
9121 PyErr_SetString(PyExc_TypeError,
9122 "format requires a mapping");
9123 goto onError;
9124 }
9125 ++fmt;
9126 --fmtcnt;
9127 keystart = fmt;
9128 /* Skip over balanced parentheses */
9129 while (pcount > 0 && --fmtcnt >= 0) {
9130 if (*fmt == ')')
9131 --pcount;
9132 else if (*fmt == '(')
9133 ++pcount;
9134 fmt++;
9135 }
9136 keylen = fmt - keystart - 1;
9137 if (fmtcnt < 0 || pcount > 0) {
9138 PyErr_SetString(PyExc_ValueError,
9139 "incomplete format key");
9140 goto onError;
9141 }
9142#if 0
9143 /* keys are converted to strings using UTF-8 and
9144 then looked up since Python uses strings to hold
9145 variables names etc. in its namespaces and we
9146 wouldn't want to break common idioms. */
9147 key = PyUnicode_EncodeUTF8(keystart,
9148 keylen,
9149 NULL);
9150#else
9151 key = PyUnicode_FromUnicode(keystart, keylen);
9152#endif
9153 if (key == NULL)
9154 goto onError;
9155 if (args_owned) {
9156 Py_DECREF(args);
9157 args_owned = 0;
9158 }
9159 args = PyObject_GetItem(dict, key);
9160 Py_DECREF(key);
9161 if (args == NULL) {
9162 goto onError;
9163 }
9164 args_owned = 1;
9165 arglen = -1;
9166 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009167 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 while (--fmtcnt >= 0) {
9169 switch (c = *fmt++) {
9170 case '-': flags |= F_LJUST; continue;
9171 case '+': flags |= F_SIGN; continue;
9172 case ' ': flags |= F_BLANK; continue;
9173 case '#': flags |= F_ALT; continue;
9174 case '0': flags |= F_ZERO; continue;
9175 }
9176 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009177 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009178 if (c == '*') {
9179 v = getnextarg(args, arglen, &argidx);
9180 if (v == NULL)
9181 goto onError;
9182 if (!PyLong_Check(v)) {
9183 PyErr_SetString(PyExc_TypeError,
9184 "* wants int");
9185 goto onError;
9186 }
9187 width = PyLong_AsLong(v);
9188 if (width == -1 && PyErr_Occurred())
9189 goto onError;
9190 if (width < 0) {
9191 flags |= F_LJUST;
9192 width = -width;
9193 }
9194 if (--fmtcnt >= 0)
9195 c = *fmt++;
9196 }
9197 else if (c >= '0' && c <= '9') {
9198 width = c - '0';
9199 while (--fmtcnt >= 0) {
9200 c = *fmt++;
9201 if (c < '0' || c > '9')
9202 break;
9203 if ((width*10) / 10 != width) {
9204 PyErr_SetString(PyExc_ValueError,
9205 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009206 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 }
9208 width = width*10 + (c - '0');
9209 }
9210 }
9211 if (c == '.') {
9212 prec = 0;
9213 if (--fmtcnt >= 0)
9214 c = *fmt++;
9215 if (c == '*') {
9216 v = getnextarg(args, arglen, &argidx);
9217 if (v == NULL)
9218 goto onError;
9219 if (!PyLong_Check(v)) {
9220 PyErr_SetString(PyExc_TypeError,
9221 "* wants int");
9222 goto onError;
9223 }
9224 prec = PyLong_AsLong(v);
9225 if (prec == -1 && PyErr_Occurred())
9226 goto onError;
9227 if (prec < 0)
9228 prec = 0;
9229 if (--fmtcnt >= 0)
9230 c = *fmt++;
9231 }
9232 else if (c >= '0' && c <= '9') {
9233 prec = c - '0';
9234 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009235 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009236 if (c < '0' || c > '9')
9237 break;
9238 if ((prec*10) / 10 != prec) {
9239 PyErr_SetString(PyExc_ValueError,
9240 "prec too big");
9241 goto onError;
9242 }
9243 prec = prec*10 + (c - '0');
9244 }
9245 }
9246 } /* prec */
9247 if (fmtcnt >= 0) {
9248 if (c == 'h' || c == 'l' || c == 'L') {
9249 if (--fmtcnt >= 0)
9250 c = *fmt++;
9251 }
9252 }
9253 if (fmtcnt < 0) {
9254 PyErr_SetString(PyExc_ValueError,
9255 "incomplete format");
9256 goto onError;
9257 }
9258 if (c != '%') {
9259 v = getnextarg(args, arglen, &argidx);
9260 if (v == NULL)
9261 goto onError;
9262 }
9263 sign = 0;
9264 fill = ' ';
9265 switch (c) {
9266
9267 case '%':
9268 pbuf = formatbuf;
9269 /* presume that buffer length is at least 1 */
9270 pbuf[0] = '%';
9271 len = 1;
9272 break;
9273
9274 case 's':
9275 case 'r':
9276 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009277 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009278 temp = v;
9279 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009280 }
9281 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009282 if (c == 's')
9283 temp = PyObject_Str(v);
9284 else if (c == 'r')
9285 temp = PyObject_Repr(v);
9286 else
9287 temp = PyObject_ASCII(v);
9288 if (temp == NULL)
9289 goto onError;
9290 if (PyUnicode_Check(temp))
9291 /* nothing to do */;
9292 else {
9293 Py_DECREF(temp);
9294 PyErr_SetString(PyExc_TypeError,
9295 "%s argument has non-string str()");
9296 goto onError;
9297 }
9298 }
9299 pbuf = PyUnicode_AS_UNICODE(temp);
9300 len = PyUnicode_GET_SIZE(temp);
9301 if (prec >= 0 && len > prec)
9302 len = prec;
9303 break;
9304
9305 case 'i':
9306 case 'd':
9307 case 'u':
9308 case 'o':
9309 case 'x':
9310 case 'X':
9311 if (c == 'i')
9312 c = 'd';
9313 isnumok = 0;
9314 if (PyNumber_Check(v)) {
9315 PyObject *iobj=NULL;
9316
9317 if (PyLong_Check(v)) {
9318 iobj = v;
9319 Py_INCREF(iobj);
9320 }
9321 else {
9322 iobj = PyNumber_Long(v);
9323 }
9324 if (iobj!=NULL) {
9325 if (PyLong_Check(iobj)) {
9326 isnumok = 1;
9327 temp = formatlong(iobj, flags, prec, c);
9328 Py_DECREF(iobj);
9329 if (!temp)
9330 goto onError;
9331 pbuf = PyUnicode_AS_UNICODE(temp);
9332 len = PyUnicode_GET_SIZE(temp);
9333 sign = 1;
9334 }
9335 else {
9336 Py_DECREF(iobj);
9337 }
9338 }
9339 }
9340 if (!isnumok) {
9341 PyErr_Format(PyExc_TypeError,
9342 "%%%c format: a number is required, "
9343 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9344 goto onError;
9345 }
9346 if (flags & F_ZERO)
9347 fill = '0';
9348 break;
9349
9350 case 'e':
9351 case 'E':
9352 case 'f':
9353 case 'F':
9354 case 'g':
9355 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009356 temp = formatfloat(v, flags, prec, c);
9357 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009358 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009359 pbuf = PyUnicode_AS_UNICODE(temp);
9360 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 sign = 1;
9362 if (flags & F_ZERO)
9363 fill = '0';
9364 break;
9365
9366 case 'c':
9367 pbuf = formatbuf;
9368 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9369 if (len < 0)
9370 goto onError;
9371 break;
9372
9373 default:
9374 PyErr_Format(PyExc_ValueError,
9375 "unsupported format character '%c' (0x%x) "
9376 "at index %zd",
9377 (31<=c && c<=126) ? (char)c : '?',
9378 (int)c,
9379 (Py_ssize_t)(fmt - 1 -
9380 PyUnicode_AS_UNICODE(uformat)));
9381 goto onError;
9382 }
9383 if (sign) {
9384 if (*pbuf == '-' || *pbuf == '+') {
9385 sign = *pbuf++;
9386 len--;
9387 }
9388 else if (flags & F_SIGN)
9389 sign = '+';
9390 else if (flags & F_BLANK)
9391 sign = ' ';
9392 else
9393 sign = 0;
9394 }
9395 if (width < len)
9396 width = len;
9397 if (rescnt - (sign != 0) < width) {
9398 reslen -= rescnt;
9399 rescnt = width + fmtcnt + 100;
9400 reslen += rescnt;
9401 if (reslen < 0) {
9402 Py_XDECREF(temp);
9403 PyErr_NoMemory();
9404 goto onError;
9405 }
9406 if (_PyUnicode_Resize(&result, reslen) < 0) {
9407 Py_XDECREF(temp);
9408 goto onError;
9409 }
9410 res = PyUnicode_AS_UNICODE(result)
9411 + reslen - rescnt;
9412 }
9413 if (sign) {
9414 if (fill != ' ')
9415 *res++ = sign;
9416 rescnt--;
9417 if (width > len)
9418 width--;
9419 }
9420 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9421 assert(pbuf[0] == '0');
9422 assert(pbuf[1] == c);
9423 if (fill != ' ') {
9424 *res++ = *pbuf++;
9425 *res++ = *pbuf++;
9426 }
9427 rescnt -= 2;
9428 width -= 2;
9429 if (width < 0)
9430 width = 0;
9431 len -= 2;
9432 }
9433 if (width > len && !(flags & F_LJUST)) {
9434 do {
9435 --rescnt;
9436 *res++ = fill;
9437 } while (--width > len);
9438 }
9439 if (fill == ' ') {
9440 if (sign)
9441 *res++ = sign;
9442 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9443 assert(pbuf[0] == '0');
9444 assert(pbuf[1] == c);
9445 *res++ = *pbuf++;
9446 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009447 }
9448 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009449 Py_UNICODE_COPY(res, pbuf, len);
9450 res += len;
9451 rescnt -= len;
9452 while (--width >= len) {
9453 --rescnt;
9454 *res++ = ' ';
9455 }
9456 if (dict && (argidx < arglen) && c != '%') {
9457 PyErr_SetString(PyExc_TypeError,
9458 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009459 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009460 goto onError;
9461 }
9462 Py_XDECREF(temp);
9463 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464 } /* until end */
9465 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009466 PyErr_SetString(PyExc_TypeError,
9467 "not all arguments converted during string formatting");
9468 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 }
9470
Thomas Woutersa96affe2006-03-12 00:29:36 +00009471 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009474 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475 }
9476 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477 return (PyObject *)result;
9478
Benjamin Peterson29060642009-01-31 22:14:21 +00009479 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 Py_XDECREF(result);
9481 Py_DECREF(uformat);
9482 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484 }
9485 return NULL;
9486}
9487
Jeremy Hylton938ace62002-07-17 16:30:39 +00009488static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009489unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9490
Tim Peters6d6c1a32001-08-02 04:15:00 +00009491static PyObject *
9492unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9493{
Benjamin Peterson29060642009-01-31 22:14:21 +00009494 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009495 static char *kwlist[] = {"object", "encoding", "errors", 0};
9496 char *encoding = NULL;
9497 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009498
Benjamin Peterson14339b62009-01-31 16:36:08 +00009499 if (type != &PyUnicode_Type)
9500 return unicode_subtype_new(type, args, kwds);
9501 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009502 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009503 return NULL;
9504 if (x == NULL)
9505 return (PyObject *)_PyUnicode_New(0);
9506 if (encoding == NULL && errors == NULL)
9507 return PyObject_Str(x);
9508 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009510}
9511
Guido van Rossume023fe02001-08-30 03:12:59 +00009512static PyObject *
9513unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9514{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009515 PyUnicodeObject *tmp, *pnew;
9516 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009517
Benjamin Peterson14339b62009-01-31 16:36:08 +00009518 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9519 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9520 if (tmp == NULL)
9521 return NULL;
9522 assert(PyUnicode_Check(tmp));
9523 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9524 if (pnew == NULL) {
9525 Py_DECREF(tmp);
9526 return NULL;
9527 }
9528 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9529 if (pnew->str == NULL) {
9530 _Py_ForgetReference((PyObject *)pnew);
9531 PyObject_Del(pnew);
9532 Py_DECREF(tmp);
9533 return PyErr_NoMemory();
9534 }
9535 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9536 pnew->length = n;
9537 pnew->hash = tmp->hash;
9538 Py_DECREF(tmp);
9539 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009540}
9541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009542PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009544\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009545Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009546encoding defaults to the current default string encoding.\n\
9547errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009548
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009549static PyObject *unicode_iter(PyObject *seq);
9550
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009552 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009553 "str", /* tp_name */
9554 sizeof(PyUnicodeObject), /* tp_size */
9555 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009557 (destructor)unicode_dealloc, /* tp_dealloc */
9558 0, /* tp_print */
9559 0, /* tp_getattr */
9560 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009561 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009562 unicode_repr, /* tp_repr */
9563 &unicode_as_number, /* tp_as_number */
9564 &unicode_as_sequence, /* tp_as_sequence */
9565 &unicode_as_mapping, /* tp_as_mapping */
9566 (hashfunc) unicode_hash, /* tp_hash*/
9567 0, /* tp_call*/
9568 (reprfunc) unicode_str, /* tp_str */
9569 PyObject_GenericGetAttr, /* tp_getattro */
9570 0, /* tp_setattro */
9571 0, /* tp_as_buffer */
9572 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009574 unicode_doc, /* tp_doc */
9575 0, /* tp_traverse */
9576 0, /* tp_clear */
9577 PyUnicode_RichCompare, /* tp_richcompare */
9578 0, /* tp_weaklistoffset */
9579 unicode_iter, /* tp_iter */
9580 0, /* tp_iternext */
9581 unicode_methods, /* tp_methods */
9582 0, /* tp_members */
9583 0, /* tp_getset */
9584 &PyBaseObject_Type, /* tp_base */
9585 0, /* tp_dict */
9586 0, /* tp_descr_get */
9587 0, /* tp_descr_set */
9588 0, /* tp_dictoffset */
9589 0, /* tp_init */
9590 0, /* tp_alloc */
9591 unicode_new, /* tp_new */
9592 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593};
9594
9595/* Initialize the Unicode implementation */
9596
Thomas Wouters78890102000-07-22 19:25:51 +00009597void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009599 int i;
9600
Thomas Wouters477c8d52006-05-27 19:21:47 +00009601 /* XXX - move this array to unicodectype.c ? */
9602 Py_UNICODE linebreak[] = {
9603 0x000A, /* LINE FEED */
9604 0x000D, /* CARRIAGE RETURN */
9605 0x001C, /* FILE SEPARATOR */
9606 0x001D, /* GROUP SEPARATOR */
9607 0x001E, /* RECORD SEPARATOR */
9608 0x0085, /* NEXT LINE */
9609 0x2028, /* LINE SEPARATOR */
9610 0x2029, /* PARAGRAPH SEPARATOR */
9611 };
9612
Fred Drakee4315f52000-05-09 19:53:39 +00009613 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009614 free_list = NULL;
9615 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009617 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009618 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009619
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009620 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009622 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009623 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009624
9625 /* initialize the linebreak bloom filter */
9626 bloom_linebreak = make_bloom_mask(
9627 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9628 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009629
9630 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631}
9632
9633/* Finalize the Unicode implementation */
9634
Christian Heimesa156e092008-02-16 07:38:31 +00009635int
9636PyUnicode_ClearFreeList(void)
9637{
9638 int freelist_size = numfree;
9639 PyUnicodeObject *u;
9640
9641 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009642 PyUnicodeObject *v = u;
9643 u = *(PyUnicodeObject **)u;
9644 if (v->str)
9645 PyObject_DEL(v->str);
9646 Py_XDECREF(v->defenc);
9647 PyObject_Del(v);
9648 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009649 }
9650 free_list = NULL;
9651 assert(numfree == 0);
9652 return freelist_size;
9653}
9654
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655void
Thomas Wouters78890102000-07-22 19:25:51 +00009656_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009658 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009660 Py_XDECREF(unicode_empty);
9661 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009662
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009663 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009664 if (unicode_latin1[i]) {
9665 Py_DECREF(unicode_latin1[i]);
9666 unicode_latin1[i] = NULL;
9667 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009668 }
Christian Heimesa156e092008-02-16 07:38:31 +00009669 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009671
Walter Dörwald16807132007-05-25 13:52:07 +00009672void
9673PyUnicode_InternInPlace(PyObject **p)
9674{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009675 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9676 PyObject *t;
9677 if (s == NULL || !PyUnicode_Check(s))
9678 Py_FatalError(
9679 "PyUnicode_InternInPlace: unicode strings only please!");
9680 /* If it's a subclass, we don't really know what putting
9681 it in the interned dict might do. */
9682 if (!PyUnicode_CheckExact(s))
9683 return;
9684 if (PyUnicode_CHECK_INTERNED(s))
9685 return;
9686 if (interned == NULL) {
9687 interned = PyDict_New();
9688 if (interned == NULL) {
9689 PyErr_Clear(); /* Don't leave an exception */
9690 return;
9691 }
9692 }
9693 /* It might be that the GetItem call fails even
9694 though the key is present in the dictionary,
9695 namely when this happens during a stack overflow. */
9696 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009697 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009698 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009699
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 if (t) {
9701 Py_INCREF(t);
9702 Py_DECREF(*p);
9703 *p = t;
9704 return;
9705 }
Walter Dörwald16807132007-05-25 13:52:07 +00009706
Benjamin Peterson14339b62009-01-31 16:36:08 +00009707 PyThreadState_GET()->recursion_critical = 1;
9708 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9709 PyErr_Clear();
9710 PyThreadState_GET()->recursion_critical = 0;
9711 return;
9712 }
9713 PyThreadState_GET()->recursion_critical = 0;
9714 /* The two references in interned are not counted by refcnt.
9715 The deallocator will take care of this */
9716 Py_REFCNT(s) -= 2;
9717 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009718}
9719
9720void
9721PyUnicode_InternImmortal(PyObject **p)
9722{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009723 PyUnicode_InternInPlace(p);
9724 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9725 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9726 Py_INCREF(*p);
9727 }
Walter Dörwald16807132007-05-25 13:52:07 +00009728}
9729
9730PyObject *
9731PyUnicode_InternFromString(const char *cp)
9732{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009733 PyObject *s = PyUnicode_FromString(cp);
9734 if (s == NULL)
9735 return NULL;
9736 PyUnicode_InternInPlace(&s);
9737 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009738}
9739
9740void _Py_ReleaseInternedUnicodeStrings(void)
9741{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009742 PyObject *keys;
9743 PyUnicodeObject *s;
9744 Py_ssize_t i, n;
9745 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009746
Benjamin Peterson14339b62009-01-31 16:36:08 +00009747 if (interned == NULL || !PyDict_Check(interned))
9748 return;
9749 keys = PyDict_Keys(interned);
9750 if (keys == NULL || !PyList_Check(keys)) {
9751 PyErr_Clear();
9752 return;
9753 }
Walter Dörwald16807132007-05-25 13:52:07 +00009754
Benjamin Peterson14339b62009-01-31 16:36:08 +00009755 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9756 detector, interned unicode strings are not forcibly deallocated;
9757 rather, we give them their stolen references back, and then clear
9758 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009759
Benjamin Peterson14339b62009-01-31 16:36:08 +00009760 n = PyList_GET_SIZE(keys);
9761 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009762 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009763 for (i = 0; i < n; i++) {
9764 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9765 switch (s->state) {
9766 case SSTATE_NOT_INTERNED:
9767 /* XXX Shouldn't happen */
9768 break;
9769 case SSTATE_INTERNED_IMMORTAL:
9770 Py_REFCNT(s) += 1;
9771 immortal_size += s->length;
9772 break;
9773 case SSTATE_INTERNED_MORTAL:
9774 Py_REFCNT(s) += 2;
9775 mortal_size += s->length;
9776 break;
9777 default:
9778 Py_FatalError("Inconsistent interned string state.");
9779 }
9780 s->state = SSTATE_NOT_INTERNED;
9781 }
9782 fprintf(stderr, "total size of all interned strings: "
9783 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9784 "mortal/immortal\n", mortal_size, immortal_size);
9785 Py_DECREF(keys);
9786 PyDict_Clear(interned);
9787 Py_DECREF(interned);
9788 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009789}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009790
9791
9792/********************* Unicode Iterator **************************/
9793
9794typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009795 PyObject_HEAD
9796 Py_ssize_t it_index;
9797 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009798} unicodeiterobject;
9799
9800static void
9801unicodeiter_dealloc(unicodeiterobject *it)
9802{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009803 _PyObject_GC_UNTRACK(it);
9804 Py_XDECREF(it->it_seq);
9805 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009806}
9807
9808static int
9809unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9810{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009811 Py_VISIT(it->it_seq);
9812 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009813}
9814
9815static PyObject *
9816unicodeiter_next(unicodeiterobject *it)
9817{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009818 PyUnicodeObject *seq;
9819 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009820
Benjamin Peterson14339b62009-01-31 16:36:08 +00009821 assert(it != NULL);
9822 seq = it->it_seq;
9823 if (seq == NULL)
9824 return NULL;
9825 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009826
Benjamin Peterson14339b62009-01-31 16:36:08 +00009827 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9828 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009829 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009830 if (item != NULL)
9831 ++it->it_index;
9832 return item;
9833 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009834
Benjamin Peterson14339b62009-01-31 16:36:08 +00009835 Py_DECREF(seq);
9836 it->it_seq = NULL;
9837 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009838}
9839
9840static PyObject *
9841unicodeiter_len(unicodeiterobject *it)
9842{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009843 Py_ssize_t len = 0;
9844 if (it->it_seq)
9845 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9846 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009847}
9848
9849PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9850
9851static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009852 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009854 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009855};
9856
9857PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009858 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9859 "str_iterator", /* tp_name */
9860 sizeof(unicodeiterobject), /* tp_basicsize */
9861 0, /* tp_itemsize */
9862 /* methods */
9863 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9864 0, /* tp_print */
9865 0, /* tp_getattr */
9866 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009867 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009868 0, /* tp_repr */
9869 0, /* tp_as_number */
9870 0, /* tp_as_sequence */
9871 0, /* tp_as_mapping */
9872 0, /* tp_hash */
9873 0, /* tp_call */
9874 0, /* tp_str */
9875 PyObject_GenericGetAttr, /* tp_getattro */
9876 0, /* tp_setattro */
9877 0, /* tp_as_buffer */
9878 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9879 0, /* tp_doc */
9880 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9881 0, /* tp_clear */
9882 0, /* tp_richcompare */
9883 0, /* tp_weaklistoffset */
9884 PyObject_SelfIter, /* tp_iter */
9885 (iternextfunc)unicodeiter_next, /* tp_iternext */
9886 unicodeiter_methods, /* tp_methods */
9887 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009888};
9889
9890static PyObject *
9891unicode_iter(PyObject *seq)
9892{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009893 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009894
Benjamin Peterson14339b62009-01-31 16:36:08 +00009895 if (!PyUnicode_Check(seq)) {
9896 PyErr_BadInternalCall();
9897 return NULL;
9898 }
9899 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9900 if (it == NULL)
9901 return NULL;
9902 it->it_index = 0;
9903 Py_INCREF(seq);
9904 it->it_seq = (PyUnicodeObject *)seq;
9905 _PyObject_GC_TRACK(it);
9906 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009907}
9908
Martin v. Löwis5b222132007-06-10 09:51:05 +00009909size_t
9910Py_UNICODE_strlen(const Py_UNICODE *u)
9911{
9912 int res = 0;
9913 while(*u++)
9914 res++;
9915 return res;
9916}
9917
9918Py_UNICODE*
9919Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9920{
9921 Py_UNICODE *u = s1;
9922 while ((*u++ = *s2++));
9923 return s1;
9924}
9925
9926Py_UNICODE*
9927Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9928{
9929 Py_UNICODE *u = s1;
9930 while ((*u++ = *s2++))
9931 if (n-- == 0)
9932 break;
9933 return s1;
9934}
9935
9936int
9937Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9938{
9939 while (*s1 && *s2 && *s1 == *s2)
9940 s1++, s2++;
9941 if (*s1 && *s2)
9942 return (*s1 < *s2) ? -1 : +1;
9943 if (*s1)
9944 return 1;
9945 if (*s2)
9946 return -1;
9947 return 0;
9948}
9949
9950Py_UNICODE*
9951Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9952{
9953 const Py_UNICODE *p;
9954 for (p = s; *p; p++)
9955 if (*p == c)
9956 return (Py_UNICODE*)p;
9957 return NULL;
9958}
9959
Victor Stinner331ea922010-08-10 16:37:20 +00009960Py_UNICODE*
9961Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
9962{
9963 const Py_UNICODE *p;
9964 p = s + Py_UNICODE_strlen(s);
9965 while (p != s) {
9966 p--;
9967 if (*p == c)
9968 return (Py_UNICODE*)p;
9969 }
9970 return NULL;
9971}
9972
Martin v. Löwis5b222132007-06-10 09:51:05 +00009973
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009974#ifdef __cplusplus
9975}
9976#endif