blob: 753b46515cc31fb18d2e9d9207ce43c66704a780 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Upper bound on the number of deallocated Unicode objects that are parked
   on the free list (unicode_dealloc() compares numfree against it). */

#define PyUnicode_MAXFREELIST       1024

/* Keep-alive threshold for the buffers of objects parked on the free list.

   unicode_dealloc() releases the character buffer of an object whose
   length is >= this limit before parking it; shorter buffers stay
   allocated so that _PyUnicode_New() can reuse them.  This reduces
   malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte-order selector: exactly one of BYTEORDER_IS_BIG_ENDIAN and
   BYTEORDER_IS_LITTLE_ENDIAN gets defined.  Little endian is the default
   whenever WORDS_BIGENDIAN is not defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   One entry per ASCII code point (128 in total, 16 per row below); an
   entry is 1 for:

     0x0009 CHARACTER TABULATION     0x001C FILE SEPARATOR
     0x000A LINE FEED                0x001D GROUP SEPARATOR
     0x000B LINE TABULATION          0x001E RECORD SEPARATOR
     0x000C FORM FEED                0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN          0x0020 SPACE

   and 0 for every other code point. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Same for linebreaks: one entry per ASCII code point (128 in total,
   16 per row below); an entry is 1 for:

     0x000A LINE FEED                0x001C FILE SEPARATOR
     0x000B LINE TABULATION          0x001D GROUP SEPARATOR
     0x000C FORM FEED                0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN

   and 0 for every other code point.  The table is only ever read (see
   BLOOM_LINEBREAK), so it is declared const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
211/* stuff to implement simple "bloom filters" for Unicode characters.
212 to keep things simple, we use a single bitmask, using the least 5
213 bits from each unicode characters as the bit index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
232#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Membership test with a bloom pre-filter: the exact (linear)
   unicode_member() scan only runs when the mask says chr may be present.
   The whole replacement list is parenthesized so the macro behaves as a
   single operand, e.g. under '!' or next to other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000310 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000313
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 return 0;
315}
316
317/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000318 Ux0000 terminated; some code (e.g. new_identifier)
319 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320
321 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000322 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323
324*/
325
326static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000327PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328{
329 register PyUnicodeObject *unicode;
330
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 if (length == 0 && unicode_empty != NULL) {
333 Py_INCREF(unicode_empty);
334 return unicode_empty;
335 }
336
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000337 /* Ensure we won't overflow the size. */
338 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
339 return (PyUnicodeObject *)PyErr_NoMemory();
340 }
341
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000343 if (free_list) {
344 unicode = free_list;
345 free_list = *(PyUnicodeObject **)unicode;
346 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000347 if (unicode->str) {
348 /* Keep-Alive optimization: we only upsize the buffer,
349 never downsize it. */
350 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000351 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000352 PyObject_DEL(unicode->str);
353 unicode->str = NULL;
354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000355 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000356 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000359 }
360 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000364 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 if (unicode == NULL)
366 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000367 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
368 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
370
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000371 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 PyErr_NoMemory();
373 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000374 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000375 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000376 * the caller fails before initializing str -- unicode_resize()
377 * reads str[0], and the Keep-Alive optimization can keep memory
378 * allocated for str alive across a call to unicode_dealloc(unicode).
379 * We don't want unicode_resize to read uninitialized memory in
380 * that case.
381 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000382 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000384 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000386 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000387 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000389
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000391 /* XXX UNREF/NEWREF interface should be more symmetrical */
392 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000393 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000394 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396}
397
398static
Guido van Rossum9475a232001-10-05 20:51:39 +0000399void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400{
Walter Dörwald16807132007-05-25 13:52:07 +0000401 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000402 case SSTATE_NOT_INTERNED:
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_MORTAL:
406 /* revive dead object temporarily for DelItem */
407 Py_REFCNT(unicode) = 3;
408 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
409 Py_FatalError(
410 "deletion of interned string failed");
411 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000412
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 case SSTATE_INTERNED_IMMORTAL:
414 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000415
Benjamin Peterson29060642009-01-31 22:14:21 +0000416 default:
417 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000418 }
419
Guido van Rossum604ddf82001-12-06 20:03:56 +0000420 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000422 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000423 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
424 PyObject_DEL(unicode->str);
425 unicode->str = NULL;
426 unicode->length = 0;
427 }
428 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000429 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000430 }
431 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000432 *(PyUnicodeObject **)unicode = free_list;
433 free_list = unicode;
434 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 }
436 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000437 PyObject_DEL(unicode->str);
438 Py_XDECREF(unicode->defenc);
439 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 }
441}
442
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000443static
444int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445{
446 register PyUnicodeObject *v;
447
448 /* Argument checks */
449 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 PyErr_BadInternalCall();
451 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000452 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000453 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000454 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 PyErr_BadInternalCall();
456 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000457 }
458
459 /* Resizing unicode_empty and single character objects is not
460 possible since these are being shared. We simply return a fresh
461 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000462 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000463 (v == unicode_empty || v->length == 1)) {
464 PyUnicodeObject *w = _PyUnicode_New(length);
465 if (w == NULL)
466 return -1;
467 Py_UNICODE_COPY(w->str, v->str,
468 length < v->length ? length : v->length);
469 Py_DECREF(*unicode);
470 *unicode = w;
471 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
473
474 /* Note that we don't have to modify *unicode for unshared Unicode
475 objects, since we can modify them in-place. */
476 return unicode_resize(v, length);
477}
478
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000479int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
480{
481 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
482}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000485 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486{
487 PyUnicodeObject *unicode;
488
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000489 /* If the Unicode data is known at construction time, we can apply
490 some optimizations which share commonly used objects. */
491 if (u != NULL) {
492
Benjamin Peterson29060642009-01-31 22:14:21 +0000493 /* Optimization for empty strings */
494 if (size == 0 && unicode_empty != NULL) {
495 Py_INCREF(unicode_empty);
496 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000497 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000498
499 /* Single character Unicode objects in the Latin-1 range are
500 shared when using this constructor */
501 if (size == 1 && *u < 256) {
502 unicode = unicode_latin1[*u];
503 if (!unicode) {
504 unicode = _PyUnicode_New(1);
505 if (!unicode)
506 return NULL;
507 unicode->str[0] = *u;
508 unicode_latin1[*u] = unicode;
509 }
510 Py_INCREF(unicode);
511 return (PyObject *)unicode;
512 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000513 }
Tim Petersced69f82003-09-16 20:30:58 +0000514
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515 unicode = _PyUnicode_New(size);
516 if (!unicode)
517 return NULL;
518
519 /* Copy the Unicode data into the new object */
520 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000521 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522
523 return (PyObject *)unicode;
524}
525
Walter Dörwaldd2034312007-05-18 16:29:38 +0000526PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527{
528 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Benjamin Peterson14339b62009-01-31 16:36:08 +0000530 if (size < 0) {
531 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000532 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000533 return NULL;
534 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000535
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000537 some optimizations which share commonly used objects.
538 Also, this means the input must be UTF-8, so fall back to the
539 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000540 if (u != NULL) {
541
Benjamin Peterson29060642009-01-31 22:14:21 +0000542 /* Optimization for empty strings */
543 if (size == 0 && unicode_empty != NULL) {
544 Py_INCREF(unicode_empty);
545 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000546 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000547
548 /* Single characters are shared when using this constructor.
549 Restrict to ASCII, since the input must be UTF-8. */
550 if (size == 1 && Py_CHARMASK(*u) < 128) {
551 unicode = unicode_latin1[Py_CHARMASK(*u)];
552 if (!unicode) {
553 unicode = _PyUnicode_New(1);
554 if (!unicode)
555 return NULL;
556 unicode->str[0] = Py_CHARMASK(*u);
557 unicode_latin1[Py_CHARMASK(*u)] = unicode;
558 }
559 Py_INCREF(unicode);
560 return (PyObject *)unicode;
561 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000562
563 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 }
565
Walter Dörwald55507312007-05-18 13:12:10 +0000566 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000567 if (!unicode)
568 return NULL;
569
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000570 return (PyObject *)unicode;
571}
572
Walter Dörwaldd2034312007-05-18 16:29:38 +0000573PyObject *PyUnicode_FromString(const char *u)
574{
575 size_t size = strlen(u);
576 if (size > PY_SSIZE_T_MAX) {
577 PyErr_SetString(PyExc_OverflowError, "input too long");
578 return NULL;
579 }
580
581 return PyUnicode_FromStringAndSize(u, size);
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584#ifdef HAVE_WCHAR_H
585
Mark Dickinson081dfee2009-03-18 14:47:41 +0000586#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
587# define CONVERT_WCHAR_TO_SURROGATES
588#endif
589
590#ifdef CONVERT_WCHAR_TO_SURROGATES
591
592/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
593 to convert from UTF32 to UTF16. */
594
595PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
596 Py_ssize_t size)
597{
598 PyUnicodeObject *unicode;
599 register Py_ssize_t i;
600 Py_ssize_t alloc;
601 const wchar_t *orig_w;
602
603 if (w == NULL) {
604 if (size == 0)
605 return PyUnicode_FromStringAndSize(NULL, 0);
606 PyErr_BadInternalCall();
607 return NULL;
608 }
609
610 if (size == -1) {
611 size = wcslen(w);
612 }
613
614 alloc = size;
615 orig_w = w;
616 for (i = size; i > 0; i--) {
617 if (*w > 0xFFFF)
618 alloc++;
619 w++;
620 }
621 w = orig_w;
622 unicode = _PyUnicode_New(alloc);
623 if (!unicode)
624 return NULL;
625
626 /* Copy the wchar_t data into the new object */
627 {
628 register Py_UNICODE *u;
629 u = PyUnicode_AS_UNICODE(unicode);
630 for (i = size; i > 0; i--) {
631 if (*w > 0xFFFF) {
632 wchar_t ordinal = *w++;
633 ordinal -= 0x10000;
634 *u++ = 0xD800 | (ordinal >> 10);
635 *u++ = 0xDC00 | (ordinal & 0x3FF);
636 }
637 else
638 *u++ = *w++;
639 }
640 }
641 return (PyObject *)unicode;
642}
643
644#else
645
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000647 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648{
649 PyUnicodeObject *unicode;
650
651 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000652 if (size == 0)
653 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 PyErr_BadInternalCall();
655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 }
657
Martin v. Löwis790465f2008-04-05 20:41:37 +0000658 if (size == -1) {
659 size = wcslen(w);
660 }
661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 unicode = _PyUnicode_New(size);
663 if (!unicode)
664 return NULL;
665
666 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000667#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000669#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000671 register Py_UNICODE *u;
672 register Py_ssize_t i;
673 u = PyUnicode_AS_UNICODE(unicode);
674 for (i = size; i > 0; i--)
675 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000676 }
677#endif
678
679 return (PyObject *)unicode;
680}
681
Mark Dickinson081dfee2009-03-18 14:47:41 +0000682#endif /* CONVERT_WCHAR_TO_SURROGATES */
683
684#undef CONVERT_WCHAR_TO_SURROGATES
685
Walter Dörwald346737f2007-05-31 10:44:43 +0000686static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000687makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
688 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000689{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000690 *fmt++ = '%';
691 if (width) {
692 if (zeropad)
693 *fmt++ = '0';
694 fmt += sprintf(fmt, "%d", width);
695 }
696 if (precision)
697 fmt += sprintf(fmt, ".%d", precision);
698 if (longflag)
699 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000700 else if (longlongflag) {
701 /* longlongflag should only ever be nonzero on machines with
702 HAVE_LONG_LONG defined */
703#ifdef HAVE_LONG_LONG
704 char *f = PY_FORMAT_LONG_LONG;
705 while (*f)
706 *fmt++ = *f++;
707#else
708 /* we shouldn't ever get here */
709 assert(0);
710 *fmt++ = 'l';
711#endif
712 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000713 else if (size_tflag) {
714 char *f = PY_FORMAT_SIZE_T;
715 while (*f)
716 *fmt++ = *f++;
717 }
718 *fmt++ = c;
719 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000720}
721
/* Append the NUL-terminated char string `string` to the output cursor.
   Relies on two variables being in scope at the point of use:
     copy -- a `const char *` scratch pointer
     s    -- the output cursor, advanced past the characters written
   Each char is stored through *s with a plain assignment (no decoding).
   Wrapped in do/while(0) so that `appendstring(x);` is a single statement
   and can be used safely in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
733
Walter Dörwaldd2034312007-05-18 16:29:38 +0000734PyObject *
735PyUnicode_FromFormatV(const char *format, va_list vargs)
736{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000737 va_list count;
738 Py_ssize_t callcount = 0;
739 PyObject **callresults = NULL;
740 PyObject **callresult = NULL;
741 Py_ssize_t n = 0;
742 int width = 0;
743 int precision = 0;
744 int zeropad;
745 const char* f;
746 Py_UNICODE *s;
747 PyObject *string;
748 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000749 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000750 /* use abuffer instead of buffer, if we need more space
751 * (which can happen if there's a format specifier with width). */
752 char *abuffer = NULL;
753 char *realbuffer;
754 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000755 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000756 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000758 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000759 /* step 1: count the number of %S/%R/%A/%s format specifications
760 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
761 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
762 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000763 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000764 if (*f == '%') {
765 if (*(f+1)=='%')
766 continue;
767 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
768 ++callcount;
769 while (ISDIGIT((unsigned)*f))
770 width = (width*10) + *f++ - '0';
771 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
772 ;
773 if (*f == 's')
774 ++callcount;
775 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000776 }
777 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000778 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000779 if (callcount) {
780 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
781 if (!callresults) {
782 PyErr_NoMemory();
783 return NULL;
784 }
785 callresult = callresults;
786 }
787 /* step 3: figure out how large a buffer we need */
788 for (f = format; *f; f++) {
789 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000790#ifdef HAVE_LONG_LONG
791 int longlongflag = 0;
792#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 const char* p = f;
794 width = 0;
795 while (ISDIGIT((unsigned)*f))
796 width = (width*10) + *f++ - '0';
797 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
798 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
801 * they don't affect the amount of space we reserve.
802 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000803 if (*f == 'l') {
804 if (f[1] == 'd' || f[1] == 'u') {
805 ++f;
806 }
807#ifdef HAVE_LONG_LONG
808 else if (f[1] == 'l' &&
809 (f[2] == 'd' || f[2] == 'u')) {
810 longlongflag = 1;
811 f += 2;
812 }
813#endif
814 }
815 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000816 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000817 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818
Benjamin Peterson14339b62009-01-31 16:36:08 +0000819 switch (*f) {
820 case 'c':
821 (void)va_arg(count, int);
822 /* fall through... */
823 case '%':
824 n++;
825 break;
826 case 'd': case 'u': case 'i': case 'x':
827 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000828#ifdef HAVE_LONG_LONG
829 if (longlongflag) {
830 if (width < MAX_LONG_LONG_CHARS)
831 width = MAX_LONG_LONG_CHARS;
832 }
833 else
834#endif
835 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
836 including sign. Decimal takes the most space. This
837 isn't enough for octal. If a width is specified we
838 need more (which we allocate later). */
839 if (width < MAX_LONG_CHARS)
840 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000842 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000843 if (abuffersize < width)
844 abuffersize = width;
845 break;
846 case 's':
847 {
848 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000849 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000850 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
851 if (!str)
852 goto fail;
853 n += PyUnicode_GET_SIZE(str);
854 /* Remember the str and switch to the next slot */
855 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000856 break;
857 }
858 case 'U':
859 {
860 PyObject *obj = va_arg(count, PyObject *);
861 assert(obj && PyUnicode_Check(obj));
862 n += PyUnicode_GET_SIZE(obj);
863 break;
864 }
865 case 'V':
866 {
867 PyObject *obj = va_arg(count, PyObject *);
868 const char *str = va_arg(count, const char *);
869 assert(obj || str);
870 assert(!obj || PyUnicode_Check(obj));
871 if (obj)
872 n += PyUnicode_GET_SIZE(obj);
873 else
874 n += strlen(str);
875 break;
876 }
877 case 'S':
878 {
879 PyObject *obj = va_arg(count, PyObject *);
880 PyObject *str;
881 assert(obj);
882 str = PyObject_Str(obj);
883 if (!str)
884 goto fail;
885 n += PyUnicode_GET_SIZE(str);
886 /* Remember the str and switch to the next slot */
887 *callresult++ = str;
888 break;
889 }
890 case 'R':
891 {
892 PyObject *obj = va_arg(count, PyObject *);
893 PyObject *repr;
894 assert(obj);
895 repr = PyObject_Repr(obj);
896 if (!repr)
897 goto fail;
898 n += PyUnicode_GET_SIZE(repr);
899 /* Remember the repr and switch to the next slot */
900 *callresult++ = repr;
901 break;
902 }
903 case 'A':
904 {
905 PyObject *obj = va_arg(count, PyObject *);
906 PyObject *ascii;
907 assert(obj);
908 ascii = PyObject_ASCII(obj);
909 if (!ascii)
910 goto fail;
911 n += PyUnicode_GET_SIZE(ascii);
912 /* Remember the repr and switch to the next slot */
913 *callresult++ = ascii;
914 break;
915 }
916 case 'p':
917 (void) va_arg(count, int);
918 /* maximum 64-bit pointer representation:
919 * 0xffffffffffffffff
920 * so 19 characters is enough.
921 * XXX I count 18 -- what's the extra for?
922 */
923 n += 19;
924 break;
925 default:
926 /* if we stumble upon an unknown
927 formatting code, copy the rest of
928 the format string to the output
929 string. (we cannot just skip the
930 code, since there's no way to know
931 what's in the argument list) */
932 n += strlen(p);
933 goto expand;
934 }
935 } else
936 n++;
937 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000938 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000939 if (abuffersize > ITEM_BUFFER_LEN) {
940 /* add 1 for sprintf's trailing null byte */
941 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000942 if (!abuffer) {
943 PyErr_NoMemory();
944 goto fail;
945 }
946 realbuffer = abuffer;
947 }
948 else
949 realbuffer = buffer;
950 /* step 4: fill the buffer */
951 /* Since we've analyzed how much space we need for the worst case,
952 we don't have to resize the string.
953 There can be no errors beyond this point. */
954 string = PyUnicode_FromUnicode(NULL, n);
955 if (!string)
956 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000957
Benjamin Peterson14339b62009-01-31 16:36:08 +0000958 s = PyUnicode_AS_UNICODE(string);
959 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000960
Benjamin Peterson14339b62009-01-31 16:36:08 +0000961 for (f = format; *f; f++) {
962 if (*f == '%') {
963 const char* p = f++;
964 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000965 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000966 int size_tflag = 0;
967 zeropad = (*f == '0');
968 /* parse the width.precision part */
969 width = 0;
970 while (ISDIGIT((unsigned)*f))
971 width = (width*10) + *f++ - '0';
972 precision = 0;
973 if (*f == '.') {
974 f++;
975 while (ISDIGIT((unsigned)*f))
976 precision = (precision*10) + *f++ - '0';
977 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000978 /* Handle %ld, %lu, %lld and %llu. */
979 if (*f == 'l') {
980 if (f[1] == 'd' || f[1] == 'u') {
981 longflag = 1;
982 ++f;
983 }
984#ifdef HAVE_LONG_LONG
985 else if (f[1] == 'l' &&
986 (f[2] == 'd' || f[2] == 'u')) {
987 longlongflag = 1;
988 f += 2;
989 }
990#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000991 }
992 /* handle the size_t flag. */
993 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
994 size_tflag = 1;
995 ++f;
996 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000997
Benjamin Peterson14339b62009-01-31 16:36:08 +0000998 switch (*f) {
999 case 'c':
1000 *s++ = va_arg(vargs, int);
1001 break;
1002 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001003 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1004 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001005 if (longflag)
1006 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001007#ifdef HAVE_LONG_LONG
1008 else if (longlongflag)
1009 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1010#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001011 else if (size_tflag)
1012 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1013 else
1014 sprintf(realbuffer, fmt, va_arg(vargs, int));
1015 appendstring(realbuffer);
1016 break;
1017 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001018 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1019 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001020 if (longflag)
1021 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001022#ifdef HAVE_LONG_LONG
1023 else if (longlongflag)
1024 sprintf(realbuffer, fmt, va_arg(vargs,
1025 unsigned PY_LONG_LONG));
1026#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001027 else if (size_tflag)
1028 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1029 else
1030 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1031 appendstring(realbuffer);
1032 break;
1033 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 sprintf(realbuffer, fmt, va_arg(vargs, int));
1036 appendstring(realbuffer);
1037 break;
1038 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001039 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001040 sprintf(realbuffer, fmt, va_arg(vargs, int));
1041 appendstring(realbuffer);
1042 break;
1043 case 's':
1044 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001045 /* unused, since we already have the result */
1046 (void) va_arg(vargs, char *);
1047 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1048 PyUnicode_GET_SIZE(*callresult));
1049 s += PyUnicode_GET_SIZE(*callresult);
1050 /* We're done with the unicode()/repr() => forget it */
1051 Py_DECREF(*callresult);
1052 /* switch to next unicode()/repr() result */
1053 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001054 break;
1055 }
1056 case 'U':
1057 {
1058 PyObject *obj = va_arg(vargs, PyObject *);
1059 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1060 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1061 s += size;
1062 break;
1063 }
1064 case 'V':
1065 {
1066 PyObject *obj = va_arg(vargs, PyObject *);
1067 const char *str = va_arg(vargs, const char *);
1068 if (obj) {
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 } else {
1073 appendstring(str);
1074 }
1075 break;
1076 }
1077 case 'S':
1078 case 'R':
1079 {
1080 Py_UNICODE *ucopy;
1081 Py_ssize_t usize;
1082 Py_ssize_t upos;
1083 /* unused, since we already have the result */
1084 (void) va_arg(vargs, PyObject *);
1085 ucopy = PyUnicode_AS_UNICODE(*callresult);
1086 usize = PyUnicode_GET_SIZE(*callresult);
1087 for (upos = 0; upos<usize;)
1088 *s++ = ucopy[upos++];
1089 /* We're done with the unicode()/repr() => forget it */
1090 Py_DECREF(*callresult);
1091 /* switch to next unicode()/repr() result */
1092 ++callresult;
1093 break;
1094 }
1095 case 'p':
1096 sprintf(buffer, "%p", va_arg(vargs, void*));
1097 /* %p is ill-defined: ensure leading 0x. */
1098 if (buffer[1] == 'X')
1099 buffer[1] = 'x';
1100 else if (buffer[1] != 'x') {
1101 memmove(buffer+2, buffer, strlen(buffer)+1);
1102 buffer[0] = '0';
1103 buffer[1] = 'x';
1104 }
1105 appendstring(buffer);
1106 break;
1107 case '%':
1108 *s++ = '%';
1109 break;
1110 default:
1111 appendstring(p);
1112 goto end;
1113 }
1114 } else
1115 *s++ = *f;
1116 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001117
Benjamin Peterson29060642009-01-31 22:14:21 +00001118 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001119 if (callresults)
1120 PyObject_Free(callresults);
1121 if (abuffer)
1122 PyObject_Free(abuffer);
1123 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1124 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001125 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001126 if (callresults) {
1127 PyObject **callresult2 = callresults;
1128 while (callresult2 < callresult) {
1129 Py_DECREF(*callresult2);
1130 ++callresult2;
1131 }
1132 PyObject_Free(callresults);
1133 }
1134 if (abuffer)
1135 PyObject_Free(abuffer);
1136 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001137}
1138
1139#undef appendstring
1140
1141PyObject *
1142PyUnicode_FromFormat(const char *format, ...)
1143{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 PyObject* ret;
1145 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001146
1147#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001148 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001149#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001150 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001151#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 ret = PyUnicode_FromFormatV(format, vargs);
1153 va_end(vargs);
1154 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
Martin v. Löwis18e16552006-02-15 17:27:45 +00001157Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 wchar_t *w,
1159 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160{
1161 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001162 PyErr_BadInternalCall();
1163 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001165
1166 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001169
Daniel Stutzbach8515eae2010-08-24 21:57:33 +00001170#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 memcpy(w, unicode->str, size * sizeof(wchar_t));
1172#else
1173 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001174 register Py_UNICODE *u;
1175 register Py_ssize_t i;
1176 u = PyUnicode_AS_UNICODE(unicode);
1177 for (i = size; i > 0; i--)
1178 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 }
1180#endif
1181
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001182 if (size > PyUnicode_GET_SIZE(unicode))
1183 return PyUnicode_GET_SIZE(unicode);
1184 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001185 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186}
1187
1188#endif
1189
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001190PyObject *PyUnicode_FromOrdinal(int ordinal)
1191{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001192 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001193
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001194 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 PyErr_SetString(PyExc_ValueError,
1196 "chr() arg not in range(0x110000)");
1197 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001198 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001199
1200#ifndef Py_UNICODE_WIDE
1201 if (ordinal > 0xffff) {
1202 ordinal -= 0x10000;
1203 s[0] = 0xD800 | (ordinal >> 10);
1204 s[1] = 0xDC00 | (ordinal & 0x3FF);
1205 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001206 }
1207#endif
1208
Hye-Shik Chang40574832004-04-06 07:24:51 +00001209 s[0] = (Py_UNICODE)ordinal;
1210 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001211}
1212
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213PyObject *PyUnicode_FromObject(register PyObject *obj)
1214{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001215 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001217 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001218 Py_INCREF(obj);
1219 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001220 }
1221 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001222 /* For a Unicode subtype that's not a Unicode object,
1223 return a true Unicode object with the same data. */
1224 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1225 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001226 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001227 PyErr_Format(PyExc_TypeError,
1228 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001229 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001230 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001231}
1232
1233PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 const char *encoding,
1235 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001236{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001237 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001238 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001239
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001241 PyErr_BadInternalCall();
1242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001244
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001245 /* Decoding bytes objects is the most common case and should be fast */
1246 if (PyBytes_Check(obj)) {
1247 if (PyBytes_GET_SIZE(obj) == 0) {
1248 Py_INCREF(unicode_empty);
1249 v = (PyObject *) unicode_empty;
1250 }
1251 else {
1252 v = PyUnicode_Decode(
1253 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1254 encoding, errors);
1255 }
1256 return v;
1257 }
1258
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001259 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001260 PyErr_SetString(PyExc_TypeError,
1261 "decoding str is not supported");
1262 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001263 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001264
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001265 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1266 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1267 PyErr_Format(PyExc_TypeError,
1268 "coercing to str: need bytes, bytearray "
1269 "or buffer-like object, %.80s found",
1270 Py_TYPE(obj)->tp_name);
1271 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001272 }
Tim Petersced69f82003-09-16 20:30:58 +00001273
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001274 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001275 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001276 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 }
Tim Petersced69f82003-09-16 20:30:58 +00001278 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001279 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001280
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001281 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283}
1284
/* Copy encoding into lower, converting upper case letters to lower case
   and replacing '_' with '-', in order to catch e.g. UTF_8.

   lower is a buffer of lower_len bytes; the result is NUL-terminated.
   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1 characters).  On failure the contents of lower are not
   terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last usable slot, reserved for the terminating NUL */
    char *limit = lower + (lower_len - 1);

    for (src = encoding; *src; src++) {
        char ch = *src;

        if (dst == limit)
            return 0;

        if (ISUPPER(ch))
            ch = TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1317
1318PyObject *PyUnicode_Decode(const char *s,
1319 Py_ssize_t size,
1320 const char *encoding,
1321 const char *errors)
1322{
1323 PyObject *buffer = NULL, *unicode;
1324 Py_buffer info;
1325 char lower[11]; /* Enough for any encoding shortcut */
1326
1327 if (encoding == NULL)
1328 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001329
1330 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001331 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1332 if (strcmp(lower, "utf-8") == 0)
1333 return PyUnicode_DecodeUTF8(s, size, errors);
1334 else if ((strcmp(lower, "latin-1") == 0) ||
1335 (strcmp(lower, "iso-8859-1") == 0))
1336 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001337#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001338 else if (strcmp(lower, "mbcs") == 0)
1339 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001340#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001341 else if (strcmp(lower, "ascii") == 0)
1342 return PyUnicode_DecodeASCII(s, size, errors);
1343 else if (strcmp(lower, "utf-16") == 0)
1344 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1345 else if (strcmp(lower, "utf-32") == 0)
1346 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348
1349 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001350 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001351 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001352 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001353 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 if (buffer == NULL)
1355 goto onError;
1356 unicode = PyCodec_Decode(buffer, encoding, errors);
1357 if (unicode == NULL)
1358 goto onError;
1359 if (!PyUnicode_Check(unicode)) {
1360 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001361 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001362 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363 Py_DECREF(unicode);
1364 goto onError;
1365 }
1366 Py_DECREF(buffer);
1367 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Benjamin Peterson29060642009-01-31 22:14:21 +00001369 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 Py_XDECREF(buffer);
1371 return NULL;
1372}
1373
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001374PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1375 const char *encoding,
1376 const char *errors)
1377{
1378 PyObject *v;
1379
1380 if (!PyUnicode_Check(unicode)) {
1381 PyErr_BadArgument();
1382 goto onError;
1383 }
1384
1385 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001386 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001387
1388 /* Decode via the codec registry */
1389 v = PyCodec_Decode(unicode, encoding, errors);
1390 if (v == NULL)
1391 goto onError;
1392 return v;
1393
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001395 return NULL;
1396}
1397
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001398PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1399 const char *encoding,
1400 const char *errors)
1401{
1402 PyObject *v;
1403
1404 if (!PyUnicode_Check(unicode)) {
1405 PyErr_BadArgument();
1406 goto onError;
1407 }
1408
1409 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001411
1412 /* Decode via the codec registry */
1413 v = PyCodec_Decode(unicode, encoding, errors);
1414 if (v == NULL)
1415 goto onError;
1416 if (!PyUnicode_Check(v)) {
1417 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001418 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001419 Py_TYPE(v)->tp_name);
1420 Py_DECREF(v);
1421 goto onError;
1422 }
1423 return v;
1424
Benjamin Peterson29060642009-01-31 22:14:21 +00001425 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001426 return NULL;
1427}
1428
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001430 Py_ssize_t size,
1431 const char *encoding,
1432 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433{
1434 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001435
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436 unicode = PyUnicode_FromUnicode(s, size);
1437 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1440 Py_DECREF(unicode);
1441 return v;
1442}
1443
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001444PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1445 const char *encoding,
1446 const char *errors)
1447{
1448 PyObject *v;
1449
1450 if (!PyUnicode_Check(unicode)) {
1451 PyErr_BadArgument();
1452 goto onError;
1453 }
1454
1455 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001456 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001457
1458 /* Encode via the codec registry */
1459 v = PyCodec_Encode(unicode, encoding, errors);
1460 if (v == NULL)
1461 goto onError;
1462 return v;
1463
Benjamin Peterson29060642009-01-31 22:14:21 +00001464 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001465 return NULL;
1466}
1467
Victor Stinnerae6265f2010-05-15 16:27:27 +00001468PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1469{
Victor Stinner313a1202010-06-11 23:56:51 +00001470 if (Py_FileSystemDefaultEncoding) {
1471#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1472 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1473 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1474 PyUnicode_GET_SIZE(unicode),
1475 NULL);
1476#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001477 return PyUnicode_AsEncodedString(unicode,
1478 Py_FileSystemDefaultEncoding,
1479 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001480 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001481 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Victor Stinner3119ed72010-08-18 22:26:50 +00001482 PyUnicode_GET_SIZE(unicode),
1483 "surrogateescape");
Victor Stinnerae6265f2010-05-15 16:27:27 +00001484}
1485
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1487 const char *encoding,
1488 const char *errors)
1489{
1490 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001491 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001492
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 if (!PyUnicode_Check(unicode)) {
1494 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 }
Fred Drakee4315f52000-05-09 19:53:39 +00001497
Tim Petersced69f82003-09-16 20:30:58 +00001498 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001500
1501 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001502 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1503 if (strcmp(lower, "utf-8") == 0)
1504 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1505 PyUnicode_GET_SIZE(unicode),
1506 errors);
1507 else if ((strcmp(lower, "latin-1") == 0) ||
1508 (strcmp(lower, "iso-8859-1") == 0))
1509 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1510 PyUnicode_GET_SIZE(unicode),
1511 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001513 else if (strcmp(lower, "mbcs") == 0)
1514 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1515 PyUnicode_GET_SIZE(unicode),
1516 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001517#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001518 else if (strcmp(lower, "ascii") == 0)
1519 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1520 PyUnicode_GET_SIZE(unicode),
1521 errors);
1522 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001523 /* During bootstrap, we may need to find the encodings
1524 package, to load the file system encoding, and require the
1525 file system encoding in order to load the encodings
1526 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001527
Victor Stinner59e62db2010-05-15 13:14:32 +00001528 Break out of this dependency by assuming that the path to
1529 the encodings module is ASCII-only. XXX could try wcstombs
1530 instead, if the file system encoding is the locale's
1531 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001532 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001533 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1534 !PyThreadState_GET()->interp->codecs_initialized)
1535 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1536 PyUnicode_GET_SIZE(unicode),
1537 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538
1539 /* Encode via the codec registry */
1540 v = PyCodec_Encode(unicode, encoding, errors);
1541 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001542 return NULL;
1543
1544 /* The normal path */
1545 if (PyBytes_Check(v))
1546 return v;
1547
1548 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001549 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001550 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001551 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001552
1553 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1554 "encoder %s returned bytearray instead of bytes",
1555 encoding);
1556 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001557 Py_DECREF(v);
1558 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001559 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001561 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1562 Py_DECREF(v);
1563 return b;
1564 }
1565
1566 PyErr_Format(PyExc_TypeError,
1567 "encoder did not return a bytes object (type=%.400s)",
1568 Py_TYPE(v)->tp_name);
1569 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001570 return NULL;
1571}
1572
1573PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1574 const char *encoding,
1575 const char *errors)
1576{
1577 PyObject *v;
1578
1579 if (!PyUnicode_Check(unicode)) {
1580 PyErr_BadArgument();
1581 goto onError;
1582 }
1583
1584 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001586
1587 /* Encode via the codec registry */
1588 v = PyCodec_Encode(unicode, encoding, errors);
1589 if (v == NULL)
1590 goto onError;
1591 if (!PyUnicode_Check(v)) {
1592 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001593 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001594 Py_TYPE(v)->tp_name);
1595 Py_DECREF(v);
1596 goto onError;
1597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001599
Benjamin Peterson29060642009-01-31 22:14:21 +00001600 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 return NULL;
1602}
1603
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001604PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001606{
1607 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001608 if (v)
1609 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001610 if (errors != NULL)
1611 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001612 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001613 PyUnicode_GET_SIZE(unicode),
1614 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001615 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001616 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001617 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001618 return v;
1619}
1620
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001621PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001622PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001623 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001624 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1625}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001626
Christian Heimes5894ba72007-11-04 11:43:14 +00001627PyObject*
1628PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1629{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001630 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1631 can be undefined. If it is case, decode using UTF-8. The following assumes
1632 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1633 bootstrapping process where the codecs aren't ready yet.
1634 */
1635 if (Py_FileSystemDefaultEncoding) {
1636#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001637 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001638 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001639 }
1640#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001641 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001642 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001643 }
1644#endif
1645 return PyUnicode_Decode(s, size,
1646 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001647 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001648 }
1649 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001650 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001651 }
1652}
1653
Martin v. Löwis011e8422009-05-05 04:43:17 +00001654
1655int
1656PyUnicode_FSConverter(PyObject* arg, void* addr)
1657{
1658 PyObject *output = NULL;
1659 Py_ssize_t size;
1660 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001661 if (arg == NULL) {
1662 Py_DECREF(*(PyObject**)addr);
1663 return 1;
1664 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001665 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001666 output = arg;
1667 Py_INCREF(output);
1668 }
1669 else {
1670 arg = PyUnicode_FromObject(arg);
1671 if (!arg)
1672 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001673 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001674 Py_DECREF(arg);
1675 if (!output)
1676 return 0;
1677 if (!PyBytes_Check(output)) {
1678 Py_DECREF(output);
1679 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1680 return 0;
1681 }
1682 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001683 size = PyBytes_GET_SIZE(output);
1684 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001685 if (size != strlen(data)) {
1686 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1687 Py_DECREF(output);
1688 return 0;
1689 }
1690 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001691 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001692}
1693
1694
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001695int
1696PyUnicode_FSDecoder(PyObject* arg, void* addr)
1697{
1698 PyObject *output = NULL;
1699 Py_ssize_t size;
1700 void *data;
1701 if (arg == NULL) {
1702 Py_DECREF(*(PyObject**)addr);
1703 return 1;
1704 }
1705 if (PyUnicode_Check(arg)) {
1706 output = arg;
1707 Py_INCREF(output);
1708 }
1709 else {
1710 arg = PyBytes_FromObject(arg);
1711 if (!arg)
1712 return 0;
1713 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1714 PyBytes_GET_SIZE(arg));
1715 Py_DECREF(arg);
1716 if (!output)
1717 return 0;
1718 if (!PyUnicode_Check(output)) {
1719 Py_DECREF(output);
1720 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1721 return 0;
1722 }
1723 }
1724 size = PyUnicode_GET_SIZE(output);
1725 data = PyUnicode_AS_UNICODE(output);
1726 if (size != Py_UNICODE_strlen(data)) {
1727 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1728 Py_DECREF(output);
1729 return 0;
1730 }
1731 *(PyObject**)addr = output;
1732 return Py_CLEANUP_SUPPORTED;
1733}
1734
1735
Martin v. Löwis5b222132007-06-10 09:51:05 +00001736char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001737_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001738{
Christian Heimesf3863112007-11-22 07:46:41 +00001739 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001740 if (!PyUnicode_Check(unicode)) {
1741 PyErr_BadArgument();
1742 return NULL;
1743 }
Christian Heimesf3863112007-11-22 07:46:41 +00001744 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1745 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001746 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001747 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001748 *psize = PyBytes_GET_SIZE(bytes);
1749 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001750}
1751
1752char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001753_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001754{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001755 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001756}
1757
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1759{
1760 if (!PyUnicode_Check(unicode)) {
1761 PyErr_BadArgument();
1762 goto onError;
1763 }
1764 return PyUnicode_AS_UNICODE(unicode);
1765
Benjamin Peterson29060642009-01-31 22:14:21 +00001766 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767 return NULL;
1768}
1769
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771{
1772 if (!PyUnicode_Check(unicode)) {
1773 PyErr_BadArgument();
1774 goto onError;
1775 }
1776 return PyUnicode_GET_SIZE(unicode);
1777
Benjamin Peterson29060642009-01-31 22:14:21 +00001778 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 return -1;
1780}
1781
Thomas Wouters78890102000-07-22 19:25:51 +00001782const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001783{
1784 return unicode_default_encoding;
1785}
1786
1787int PyUnicode_SetDefaultEncoding(const char *encoding)
1788{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001789 if (strcmp(encoding, unicode_default_encoding) != 0) {
1790 PyErr_Format(PyExc_ValueError,
1791 "Can only set default encoding to %s",
1792 unicode_default_encoding);
1793 return -1;
1794 }
Fred Drakee4315f52000-05-09 19:53:39 +00001795 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001796}
1797
Victor Stinner554f3f02010-06-16 23:33:54 +00001798/* create or adjust a UnicodeDecodeError */
1799static void
1800make_decode_exception(PyObject **exceptionObject,
1801 const char *encoding,
1802 const char *input, Py_ssize_t length,
1803 Py_ssize_t startpos, Py_ssize_t endpos,
1804 const char *reason)
1805{
1806 if (*exceptionObject == NULL) {
1807 *exceptionObject = PyUnicodeDecodeError_Create(
1808 encoding, input, length, startpos, endpos, reason);
1809 }
1810 else {
1811 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1812 goto onError;
1813 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1814 goto onError;
1815 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1816 goto onError;
1817 }
1818 return;
1819
1820onError:
1821 Py_DECREF(*exceptionObject);
1822 *exceptionObject = NULL;
1823}
1824
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825/* error handling callback helper:
1826 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001827 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 and adjust various state variables.
1829 return 0 on success, -1 on error
1830*/
1831
1832static
1833int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001834 const char *encoding, const char *reason,
1835 const char **input, const char **inend, Py_ssize_t *startinpos,
1836 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1837 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001838{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001839 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001840
1841 PyObject *restuple = NULL;
1842 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001843 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001844 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001845 Py_ssize_t requiredsize;
1846 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001847 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001848 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001849 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001850 int res = -1;
1851
1852 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001853 *errorHandler = PyCodec_LookupError(errors);
1854 if (*errorHandler == NULL)
1855 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001856 }
1857
Victor Stinner554f3f02010-06-16 23:33:54 +00001858 make_decode_exception(exceptionObject,
1859 encoding,
1860 *input, *inend - *input,
1861 *startinpos, *endinpos,
1862 reason);
1863 if (*exceptionObject == NULL)
1864 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001865
1866 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1867 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001868 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001870 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 }
1873 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001874 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001875
1876 /* Copy back the bytes variables, which might have been modified by the
1877 callback */
1878 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1879 if (!inputobj)
1880 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001881 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001882 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001883 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001884 *input = PyBytes_AS_STRING(inputobj);
1885 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001886 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001887 /* we can DECREF safely, as the exception has another reference,
1888 so the object won't go away. */
1889 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001890
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001892 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001893 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001894 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1895 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001896 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001897
1898 /* need more space? (at least enough for what we
1899 have+the replacement+the rest of the string (starting
1900 at the new input position), so we won't have to check space
1901 when there are no errors in the rest of the string) */
1902 repptr = PyUnicode_AS_UNICODE(repunicode);
1903 repsize = PyUnicode_GET_SIZE(repunicode);
1904 requiredsize = *outpos + repsize + insize-newpos;
1905 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001906 if (requiredsize<2*outsize)
1907 requiredsize = 2*outsize;
1908 if (_PyUnicode_Resize(output, requiredsize) < 0)
1909 goto onError;
1910 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001911 }
1912 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001913 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 Py_UNICODE_COPY(*outptr, repptr, repsize);
1915 *outptr += repsize;
1916 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001917
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001918 /* we made it! */
1919 res = 0;
1920
Benjamin Peterson29060642009-01-31 22:14:21 +00001921 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 Py_XDECREF(restuple);
1923 return res;
1924}
1925
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001926/* --- UTF-7 Codec -------------------------------------------------------- */
1927
Antoine Pitrou244651a2009-05-04 18:56:13 +00001928/* See RFC2152 for details. We encode conservatively and decode liberally. */
1929
/* Base-64 helper macros for the UTF-7 codec.  Each of them may evaluate
   its argument more than once, so only pass expressions without side
   effects. */

/* Nonzero when c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))

/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds;
   every character not matched by an earlier arm yields 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     '+' == (c) ? 62 : 63)

/* Base-64 character encoding the bottom 6 bits of n. */

#define TO_BASE64(n)                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                       \
     "abcdefghijklmnopqrstuvwxyz"                       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: nonzero when a byte found in a UTF-7 string outside a
 * shift sequence decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
1960
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code, 16 entries per row.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; only values 1..127 can be direct (the
 *            range test also keeps the table index in bounds)
 * directO  - nonzero to emit Set O characters as themselves
 * directWS - nonzero to emit whitespace characters as themselves
 *
 * Both flags are parenthesized in the expansion so that an argument
 * such as `a || b` is combined with the category test as a whole.
 * c is evaluated several times; pass an expression without side
 * effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002006
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002007PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002008 Py_ssize_t size,
2009 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002010{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002011 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2012}
2013
Antoine Pitrou244651a2009-05-04 18:56:13 +00002014/* The decoder. The only state we preserve is our read position,
2015 * i.e. how many characters we have consumed. So if we end in the
2016 * middle of a shift sequence we have to back off the read position
2017 * and the output to the beginning of the sequence, otherwise we lose
2018 * all the shift state (seen bits, number of bits seen, high
2019 * surrogate). */
2020
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002021PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002022 Py_ssize_t size,
2023 const char *errors,
2024 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002025{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002027 Py_ssize_t startinpos;
2028 Py_ssize_t endinpos;
2029 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002030 const char *e;
2031 PyUnicodeObject *unicode;
2032 Py_UNICODE *p;
2033 const char *errmsg = "";
2034 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002035 Py_UNICODE *shiftOutStart;
2036 unsigned int base64bits = 0;
2037 unsigned long base64buffer = 0;
2038 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002039 PyObject *errorHandler = NULL;
2040 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002041
2042 unicode = _PyUnicode_New(size);
2043 if (!unicode)
2044 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002045 if (size == 0) {
2046 if (consumed)
2047 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002048 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002049 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002050
2051 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002052 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002053 e = s + size;
2054
2055 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002057 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002058 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002059
Antoine Pitrou244651a2009-05-04 18:56:13 +00002060 if (inShift) { /* in a base-64 section */
2061 if (IS_BASE64(ch)) { /* consume a base-64 character */
2062 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2063 base64bits += 6;
2064 s++;
2065 if (base64bits >= 16) {
2066 /* we have enough bits for a UTF-16 value */
2067 Py_UNICODE outCh = (Py_UNICODE)
2068 (base64buffer >> (base64bits-16));
2069 base64bits -= 16;
2070 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2071 if (surrogate) {
2072 /* expecting a second surrogate */
2073 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2074#ifdef Py_UNICODE_WIDE
2075 *p++ = (((surrogate & 0x3FF)<<10)
2076 | (outCh & 0x3FF)) + 0x10000;
2077#else
2078 *p++ = surrogate;
2079 *p++ = outCh;
2080#endif
2081 surrogate = 0;
2082 }
2083 else {
2084 surrogate = 0;
2085 errmsg = "second surrogate missing";
2086 goto utf7Error;
2087 }
2088 }
2089 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2090 /* first surrogate */
2091 surrogate = outCh;
2092 }
2093 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2094 errmsg = "unexpected second surrogate";
2095 goto utf7Error;
2096 }
2097 else {
2098 *p++ = outCh;
2099 }
2100 }
2101 }
2102 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002103 inShift = 0;
2104 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002105 if (surrogate) {
2106 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002107 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002108 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002109 if (base64bits > 0) { /* left-over bits */
2110 if (base64bits >= 6) {
2111 /* We've seen at least one base-64 character */
2112 errmsg = "partial character in shift sequence";
2113 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002114 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002115 else {
2116 /* Some bits remain; they should be zero */
2117 if (base64buffer != 0) {
2118 errmsg = "non-zero padding bits in shift sequence";
2119 goto utf7Error;
2120 }
2121 }
2122 }
2123 if (ch != '-') {
2124 /* '-' is absorbed; other terminating
2125 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002126 *p++ = ch;
2127 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002128 }
2129 }
2130 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002132 s++; /* consume '+' */
2133 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002134 s++;
2135 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002136 }
2137 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002138 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002139 shiftOutStart = p;
2140 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002141 }
2142 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002143 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002144 *p++ = ch;
2145 s++;
2146 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002147 else {
2148 startinpos = s-starts;
2149 s++;
2150 errmsg = "unexpected special character";
2151 goto utf7Error;
2152 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002153 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002154utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002155 outpos = p-PyUnicode_AS_UNICODE(unicode);
2156 endinpos = s-starts;
2157 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002158 errors, &errorHandler,
2159 "utf7", errmsg,
2160 &starts, &e, &startinpos, &endinpos, &exc, &s,
2161 &unicode, &outpos, &p))
2162 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002163 }
2164
Antoine Pitrou244651a2009-05-04 18:56:13 +00002165 /* end of string */
2166
2167 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2168 /* if we're in an inconsistent state, that's an error */
2169 if (surrogate ||
2170 (base64bits >= 6) ||
2171 (base64bits > 0 && base64buffer != 0)) {
2172 outpos = p-PyUnicode_AS_UNICODE(unicode);
2173 endinpos = size;
2174 if (unicode_decode_call_errorhandler(
2175 errors, &errorHandler,
2176 "utf7", "unterminated shift sequence",
2177 &starts, &e, &startinpos, &endinpos, &exc, &s,
2178 &unicode, &outpos, &p))
2179 goto onError;
2180 if (s < e)
2181 goto restart;
2182 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002183 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002184
2185 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002186 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002187 if (inShift) {
2188 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002189 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002190 }
2191 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002192 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002193 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002194 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002195
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002196 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002197 goto onError;
2198
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002199 Py_XDECREF(errorHandler);
2200 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002201 return (PyObject *)unicode;
2202
Benjamin Peterson29060642009-01-31 22:14:21 +00002203 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002204 Py_XDECREF(errorHandler);
2205 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206 Py_DECREF(unicode);
2207 return NULL;
2208}
2209
2210
2211PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002212 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002213 int base64SetO,
2214 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002215 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002216{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002217 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002218 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002219 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002220 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002221 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002222 unsigned int base64bits = 0;
2223 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002224 char * out;
2225 char * start;
2226
2227 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002228 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002229
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002230 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002231 return PyErr_NoMemory();
2232
Antoine Pitrou244651a2009-05-04 18:56:13 +00002233 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002234 if (v == NULL)
2235 return NULL;
2236
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002237 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002238 for (;i < size; ++i) {
2239 Py_UNICODE ch = s[i];
2240
Antoine Pitrou244651a2009-05-04 18:56:13 +00002241 if (inShift) {
2242 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2243 /* shifting out */
2244 if (base64bits) { /* output remaining bits */
2245 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2246 base64buffer = 0;
2247 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002248 }
2249 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002250 /* Characters not in the BASE64 set implicitly unshift the sequence
2251 so no '-' is required, except if the character is itself a '-' */
2252 if (IS_BASE64(ch) || ch == '-') {
2253 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002254 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002255 *out++ = (char) ch;
2256 }
2257 else {
2258 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002259 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002260 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002261 else { /* not in a shift sequence */
2262 if (ch == '+') {
2263 *out++ = '+';
2264 *out++ = '-';
2265 }
2266 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2267 *out++ = (char) ch;
2268 }
2269 else {
2270 *out++ = '+';
2271 inShift = 1;
2272 goto encode_char;
2273 }
2274 }
2275 continue;
2276encode_char:
2277#ifdef Py_UNICODE_WIDE
2278 if (ch >= 0x10000) {
2279 /* code first surrogate */
2280 base64bits += 16;
2281 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2282 while (base64bits >= 6) {
2283 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2284 base64bits -= 6;
2285 }
2286 /* prepare second surrogate */
2287 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2288 }
2289#endif
2290 base64bits += 16;
2291 base64buffer = (base64buffer << 16) | ch;
2292 while (base64bits >= 6) {
2293 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2294 base64bits -= 6;
2295 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002296 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002297 if (base64bits)
2298 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2299 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002300 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002301 if (_PyBytes_Resize(&v, out - start) < 0)
2302 return NULL;
2303 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002304}
2305
Antoine Pitrou244651a2009-05-04 18:56:13 +00002306#undef IS_BASE64
2307#undef FROM_BASE64
2308#undef TO_BASE64
2309#undef DECODE_DIRECT
2310#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002311
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312/* --- UTF-8 Codec -------------------------------------------------------- */
2313
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.

   The table is only ever read, so it is declared const. */
static const
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2335
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002337 Py_ssize_t size,
2338 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002339{
Walter Dörwald69652032004-09-07 20:24:22 +00002340 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2341}
2342
Antoine Pitrouab868312009-01-10 15:40:25 +00002343/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2344#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2345
2346/* Mask to quickly check whether a C 'long' contains a
2347 non-ASCII, UTF8-encoded char. */
2348#if (SIZEOF_LONG == 8)
2349# define ASCII_CHAR_MASK 0x8080808080808080L
2350#elif (SIZEOF_LONG == 4)
2351# define ASCII_CHAR_MASK 0x80808080L
2352#else
2353# error C 'long' size should be either 4 or 8!
2354#endif
2355
Walter Dörwald69652032004-09-07 20:24:22 +00002356PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002357 Py_ssize_t size,
2358 const char *errors,
2359 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002360{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002361 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002363 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002364 Py_ssize_t startinpos;
2365 Py_ssize_t endinpos;
2366 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002367 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368 PyUnicodeObject *unicode;
2369 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002370 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002371 PyObject *errorHandler = NULL;
2372 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373
2374 /* Note: size will always be longer than the resulting Unicode
2375 character count */
2376 unicode = _PyUnicode_New(size);
2377 if (!unicode)
2378 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002379 if (size == 0) {
2380 if (consumed)
2381 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002382 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002383 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384
2385 /* Unpack UTF-8 encoded data */
2386 p = unicode->str;
2387 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002388 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389
2390 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002391 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002392
2393 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002394 /* Fast path for runs of ASCII characters. Given that common UTF-8
2395 input will consist of an overwhelming majority of ASCII
2396 characters, we try to optimize for this case by checking
2397 as many characters as a C 'long' can contain.
2398 First, check if we can do an aligned read, as most CPUs have
2399 a penalty for unaligned reads.
2400 */
2401 if (!((size_t) s & LONG_PTR_MASK)) {
2402 /* Help register allocation */
2403 register const char *_s = s;
2404 register Py_UNICODE *_p = p;
2405 while (_s < aligned_end) {
2406 /* Read a whole long at a time (either 4 or 8 bytes),
2407 and do a fast unrolled copy if it only contains ASCII
2408 characters. */
2409 unsigned long data = *(unsigned long *) _s;
2410 if (data & ASCII_CHAR_MASK)
2411 break;
2412 _p[0] = (unsigned char) _s[0];
2413 _p[1] = (unsigned char) _s[1];
2414 _p[2] = (unsigned char) _s[2];
2415 _p[3] = (unsigned char) _s[3];
2416#if (SIZEOF_LONG == 8)
2417 _p[4] = (unsigned char) _s[4];
2418 _p[5] = (unsigned char) _s[5];
2419 _p[6] = (unsigned char) _s[6];
2420 _p[7] = (unsigned char) _s[7];
2421#endif
2422 _s += SIZEOF_LONG;
2423 _p += SIZEOF_LONG;
2424 }
2425 s = _s;
2426 p = _p;
2427 if (s == e)
2428 break;
2429 ch = (unsigned char)*s;
2430 }
2431 }
2432
2433 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002434 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 s++;
2436 continue;
2437 }
2438
2439 n = utf8_code_length[ch];
2440
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002441 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002442 if (consumed)
2443 break;
2444 else {
2445 errmsg = "unexpected end of data";
2446 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002447 endinpos = startinpos+1;
2448 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2449 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002450 goto utf8Error;
2451 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002452 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453
2454 switch (n) {
2455
2456 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002457 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002458 startinpos = s-starts;
2459 endinpos = startinpos+1;
2460 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461
2462 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002463 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002464 startinpos = s-starts;
2465 endinpos = startinpos+1;
2466 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467
2468 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002469 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002470 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002471 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002472 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002473 goto utf8Error;
2474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002476 assert ((ch > 0x007F) && (ch <= 0x07FF));
2477 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 break;
2479
2480 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002481 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2482 will result in surrogates in range d800-dfff. Surrogates are
2483 not valid UTF-8 so they are rejected.
2484 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2485 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002486 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002487 (s[2] & 0xc0) != 0x80 ||
2488 ((unsigned char)s[0] == 0xE0 &&
2489 (unsigned char)s[1] < 0xA0) ||
2490 ((unsigned char)s[0] == 0xED &&
2491 (unsigned char)s[1] > 0x9F)) {
2492 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002493 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002494 endinpos = startinpos + 1;
2495
2496 /* if s[1] first two bits are 1 and 0, then the invalid
2497 continuation byte is s[2], so increment endinpos by 1,
2498 if not, s[1] is invalid and endinpos doesn't need to
2499 be incremented. */
2500 if ((s[1] & 0xC0) == 0x80)
2501 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002502 goto utf8Error;
2503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002505 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2506 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002507 break;
2508
2509 case 4:
2510 if ((s[1] & 0xc0) != 0x80 ||
2511 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002512 (s[3] & 0xc0) != 0x80 ||
2513 ((unsigned char)s[0] == 0xF0 &&
2514 (unsigned char)s[1] < 0x90) ||
2515 ((unsigned char)s[0] == 0xF4 &&
2516 (unsigned char)s[1] > 0x8F)) {
2517 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002518 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002519 endinpos = startinpos + 1;
2520 if ((s[1] & 0xC0) == 0x80) {
2521 endinpos++;
2522 if ((s[2] & 0xC0) == 0x80)
2523 endinpos++;
2524 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002525 goto utf8Error;
2526 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002527 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002528 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2529 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2530
Fredrik Lundh8f455852001-06-27 18:59:43 +00002531#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002532 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002533#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002534 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002535
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002536 /* translate from 10000..10FFFF to 0..FFFF */
2537 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002538
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002539 /* high surrogate = top 10 bits added to D800 */
2540 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002541
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002542 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002543 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002544#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546 }
2547 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002548 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002549
Benjamin Peterson29060642009-01-31 22:14:21 +00002550 utf8Error:
2551 outpos = p-PyUnicode_AS_UNICODE(unicode);
2552 if (unicode_decode_call_errorhandler(
2553 errors, &errorHandler,
2554 "utf8", errmsg,
2555 &starts, &e, &startinpos, &endinpos, &exc, &s,
2556 &unicode, &outpos, &p))
2557 goto onError;
2558 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 }
Walter Dörwald69652032004-09-07 20:24:22 +00002560 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002561 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562
2563 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002564 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565 goto onError;
2566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002567 Py_XDECREF(errorHandler);
2568 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569 return (PyObject *)unicode;
2570
Benjamin Peterson29060642009-01-31 22:14:21 +00002571 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002572 Py_XDECREF(errorHandler);
2573 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 Py_DECREF(unicode);
2575 return NULL;
2576}
2577
Antoine Pitrouab868312009-01-10 15:40:25 +00002578#undef ASCII_CHAR_MASK
2579
2580
Tim Peters602f7402002-04-27 18:03:26 +00002581/* Allocation strategy: if the string is short, convert into a stack buffer
2582 and allocate exactly as much space needed at the end. Else allocate the
2583 maximum possible needed (4 result bytes per Unicode character), and return
2584 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002585*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002586PyObject *
2587PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002588 Py_ssize_t size,
2589 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590{
Tim Peters602f7402002-04-27 18:03:26 +00002591#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002592
Guido van Rossum98297ee2007-11-06 21:34:58 +00002593 Py_ssize_t i; /* index into s of next input byte */
2594 PyObject *result; /* result string object */
2595 char *p; /* next free byte in output buffer */
2596 Py_ssize_t nallocated; /* number of result bytes allocated */
2597 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002598 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002599 PyObject *errorHandler = NULL;
2600 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002601
Tim Peters602f7402002-04-27 18:03:26 +00002602 assert(s != NULL);
2603 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604
Tim Peters602f7402002-04-27 18:03:26 +00002605 if (size <= MAX_SHORT_UNICHARS) {
2606 /* Write into the stack buffer; nallocated can't overflow.
2607 * At the end, we'll allocate exactly as much heap space as it
2608 * turns out we need.
2609 */
2610 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002611 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002612 p = stackbuf;
2613 }
2614 else {
2615 /* Overallocate on the heap, and give the excess back at the end. */
2616 nallocated = size * 4;
2617 if (nallocated / 4 != size) /* overflow! */
2618 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002619 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002620 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002621 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002622 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002623 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002624
Tim Peters602f7402002-04-27 18:03:26 +00002625 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002626 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002627
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002628 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002629 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002631
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002633 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002634 *p++ = (char)(0xc0 | (ch >> 6));
2635 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002636 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002637#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002638 /* Special case: check for high and low surrogate */
2639 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2640 Py_UCS4 ch2 = s[i];
2641 /* Combine the two surrogates to form a UCS4 value */
2642 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2643 i++;
2644
2645 /* Encode UCS4 Unicode ordinals */
2646 *p++ = (char)(0xf0 | (ch >> 18));
2647 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002648 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2649 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002650 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002651#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002652 Py_ssize_t newpos;
2653 PyObject *rep;
2654 Py_ssize_t repsize, k;
2655 rep = unicode_encode_call_errorhandler
2656 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2657 s, size, &exc, i-1, i, &newpos);
2658 if (!rep)
2659 goto error;
2660
2661 if (PyBytes_Check(rep))
2662 repsize = PyBytes_GET_SIZE(rep);
2663 else
2664 repsize = PyUnicode_GET_SIZE(rep);
2665
2666 if (repsize > 4) {
2667 Py_ssize_t offset;
2668
2669 if (result == NULL)
2670 offset = p - stackbuf;
2671 else
2672 offset = p - PyBytes_AS_STRING(result);
2673
2674 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2675 /* integer overflow */
2676 PyErr_NoMemory();
2677 goto error;
2678 }
2679 nallocated += repsize - 4;
2680 if (result != NULL) {
2681 if (_PyBytes_Resize(&result, nallocated) < 0)
2682 goto error;
2683 } else {
2684 result = PyBytes_FromStringAndSize(NULL, nallocated);
2685 if (result == NULL)
2686 goto error;
2687 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2688 }
2689 p = PyBytes_AS_STRING(result) + offset;
2690 }
2691
2692 if (PyBytes_Check(rep)) {
2693 char *prep = PyBytes_AS_STRING(rep);
2694 for(k = repsize; k > 0; k--)
2695 *p++ = *prep++;
2696 } else /* rep is unicode */ {
2697 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2698 Py_UNICODE c;
2699
2700 for(k=0; k<repsize; k++) {
2701 c = prep[k];
2702 if (0x80 <= c) {
2703 raise_encode_exception(&exc, "utf-8", s, size,
2704 i-1, i, "surrogates not allowed");
2705 goto error;
2706 }
2707 *p++ = (char)prep[k];
2708 }
2709 }
2710 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002711#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002712 }
Victor Stinner445a6232010-04-22 20:01:57 +00002713#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002714 } else if (ch < 0x10000) {
2715 *p++ = (char)(0xe0 | (ch >> 12));
2716 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2717 *p++ = (char)(0x80 | (ch & 0x3f));
2718 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002719 /* Encode UCS4 Unicode ordinals */
2720 *p++ = (char)(0xf0 | (ch >> 18));
2721 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2722 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2723 *p++ = (char)(0x80 | (ch & 0x3f));
2724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002726
Guido van Rossum98297ee2007-11-06 21:34:58 +00002727 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002728 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002729 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002730 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002731 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002732 }
2733 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002734 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002735 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002736 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002737 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002738 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002739 Py_XDECREF(errorHandler);
2740 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002741 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002742 error:
2743 Py_XDECREF(errorHandler);
2744 Py_XDECREF(exc);
2745 Py_XDECREF(result);
2746 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002747
Tim Peters602f7402002-04-27 18:03:26 +00002748#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749}
2750
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2752{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 if (!PyUnicode_Check(unicode)) {
2754 PyErr_BadArgument();
2755 return NULL;
2756 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002757 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002758 PyUnicode_GET_SIZE(unicode),
2759 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760}
2761
Walter Dörwald41980ca2007-08-16 21:55:45 +00002762/* --- UTF-32 Codec ------------------------------------------------------- */
2763
2764PyObject *
2765PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 Py_ssize_t size,
2767 const char *errors,
2768 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002769{
2770 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2771}
2772
2773PyObject *
2774PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002775 Py_ssize_t size,
2776 const char *errors,
2777 int *byteorder,
2778 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002779{
2780 const char *starts = s;
2781 Py_ssize_t startinpos;
2782 Py_ssize_t endinpos;
2783 Py_ssize_t outpos;
2784 PyUnicodeObject *unicode;
2785 Py_UNICODE *p;
2786#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002787 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002788 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002789#else
2790 const int pairs = 0;
2791#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002792 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002793 int bo = 0; /* assume native ordering by default */
2794 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002795 /* Offsets from q for retrieving bytes in the right order. */
2796#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2797 int iorder[] = {0, 1, 2, 3};
2798#else
2799 int iorder[] = {3, 2, 1, 0};
2800#endif
2801 PyObject *errorHandler = NULL;
2802 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002803
Walter Dörwald41980ca2007-08-16 21:55:45 +00002804 q = (unsigned char *)s;
2805 e = q + size;
2806
2807 if (byteorder)
2808 bo = *byteorder;
2809
2810 /* Check for BOM marks (U+FEFF) in the input and adjust current
2811 byte order setting accordingly. In native mode, the leading BOM
2812 mark is skipped, in all other modes, it is copied to the output
2813 stream as-is (giving a ZWNBSP character). */
2814 if (bo == 0) {
2815 if (size >= 4) {
2816 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002817 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002818#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 if (bom == 0x0000FEFF) {
2820 q += 4;
2821 bo = -1;
2822 }
2823 else if (bom == 0xFFFE0000) {
2824 q += 4;
2825 bo = 1;
2826 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002827#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 if (bom == 0x0000FEFF) {
2829 q += 4;
2830 bo = 1;
2831 }
2832 else if (bom == 0xFFFE0000) {
2833 q += 4;
2834 bo = -1;
2835 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002836#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002837 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002838 }
2839
2840 if (bo == -1) {
2841 /* force LE */
2842 iorder[0] = 0;
2843 iorder[1] = 1;
2844 iorder[2] = 2;
2845 iorder[3] = 3;
2846 }
2847 else if (bo == 1) {
2848 /* force BE */
2849 iorder[0] = 3;
2850 iorder[1] = 2;
2851 iorder[2] = 1;
2852 iorder[3] = 0;
2853 }
2854
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002855 /* On narrow builds we split characters outside the BMP into two
2856 codepoints => count how much extra space we need. */
2857#ifndef Py_UNICODE_WIDE
2858 for (qq = q; qq < e; qq += 4)
2859 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2860 pairs++;
2861#endif
2862
2863 /* This might be one to much, because of a BOM */
2864 unicode = _PyUnicode_New((size+3)/4+pairs);
2865 if (!unicode)
2866 return NULL;
2867 if (size == 0)
2868 return (PyObject *)unicode;
2869
2870 /* Unpack UTF-32 encoded data */
2871 p = unicode->str;
2872
Walter Dörwald41980ca2007-08-16 21:55:45 +00002873 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002874 Py_UCS4 ch;
2875 /* remaining bytes at the end? (size should be divisible by 4) */
2876 if (e-q<4) {
2877 if (consumed)
2878 break;
2879 errmsg = "truncated data";
2880 startinpos = ((const char *)q)-starts;
2881 endinpos = ((const char *)e)-starts;
2882 goto utf32Error;
2883 /* The remaining input chars are ignored if the callback
2884 chooses to skip the input */
2885 }
2886 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2887 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002888
Benjamin Peterson29060642009-01-31 22:14:21 +00002889 if (ch >= 0x110000)
2890 {
2891 errmsg = "codepoint not in range(0x110000)";
2892 startinpos = ((const char *)q)-starts;
2893 endinpos = startinpos+4;
2894 goto utf32Error;
2895 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002896#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 if (ch >= 0x10000)
2898 {
2899 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2900 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2901 }
2902 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002903#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002904 *p++ = ch;
2905 q += 4;
2906 continue;
2907 utf32Error:
2908 outpos = p-PyUnicode_AS_UNICODE(unicode);
2909 if (unicode_decode_call_errorhandler(
2910 errors, &errorHandler,
2911 "utf32", errmsg,
2912 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2913 &unicode, &outpos, &p))
2914 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002915 }
2916
2917 if (byteorder)
2918 *byteorder = bo;
2919
2920 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002921 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002922
2923 /* Adjust length */
2924 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2925 goto onError;
2926
2927 Py_XDECREF(errorHandler);
2928 Py_XDECREF(exc);
2929 return (PyObject *)unicode;
2930
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002932 Py_DECREF(unicode);
2933 Py_XDECREF(errorHandler);
2934 Py_XDECREF(exc);
2935 return NULL;
2936}
2937
2938PyObject *
2939PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002940 Py_ssize_t size,
2941 const char *errors,
2942 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002943{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002944 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002945 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002946 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002947#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002948 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002949#else
2950 const int pairs = 0;
2951#endif
2952 /* Offsets from p for storing byte pairs in the right order. */
2953#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2954 int iorder[] = {0, 1, 2, 3};
2955#else
2956 int iorder[] = {3, 2, 1, 0};
2957#endif
2958
Benjamin Peterson29060642009-01-31 22:14:21 +00002959#define STORECHAR(CH) \
2960 do { \
2961 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2962 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2963 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2964 p[iorder[0]] = (CH) & 0xff; \
2965 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002966 } while(0)
2967
2968 /* In narrow builds we can output surrogate pairs as one codepoint,
2969 so we need less space. */
2970#ifndef Py_UNICODE_WIDE
2971 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002972 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2973 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2974 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002975#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002976 nsize = (size - pairs + (byteorder == 0));
2977 bytesize = nsize * 4;
2978 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002979 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002980 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002981 if (v == NULL)
2982 return NULL;
2983
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002984 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002985 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002986 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002987 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002988 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002989
2990 if (byteorder == -1) {
2991 /* force LE */
2992 iorder[0] = 0;
2993 iorder[1] = 1;
2994 iorder[2] = 2;
2995 iorder[3] = 3;
2996 }
2997 else if (byteorder == 1) {
2998 /* force BE */
2999 iorder[0] = 3;
3000 iorder[1] = 2;
3001 iorder[2] = 1;
3002 iorder[3] = 0;
3003 }
3004
3005 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003006 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003007#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003008 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3009 Py_UCS4 ch2 = *s;
3010 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3011 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3012 s++;
3013 size--;
3014 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003015 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003016#endif
3017 STORECHAR(ch);
3018 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003019
3020 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003021 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003022#undef STORECHAR
3023}
3024
3025PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3026{
3027 if (!PyUnicode_Check(unicode)) {
3028 PyErr_BadArgument();
3029 return NULL;
3030 }
3031 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 PyUnicode_GET_SIZE(unicode),
3033 NULL,
3034 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003035}
3036
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037/* --- UTF-16 Codec ------------------------------------------------------- */
3038
Tim Peters772747b2001-08-09 22:21:55 +00003039PyObject *
3040PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 Py_ssize_t size,
3042 const char *errors,
3043 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044{
Walter Dörwald69652032004-09-07 20:24:22 +00003045 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3046}
3047
Antoine Pitrouab868312009-01-10 15:40:25 +00003048/* Two masks for fast checking of whether a C 'long' may contain
3049 UTF16-encoded surrogate characters. This is an efficient heuristic,
3050 assuming that non-surrogate characters with a code point >= 0x8000 are
3051 rare in most input.
3052 FAST_CHAR_MASK is used when the input is in native byte ordering,
3053 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003054*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003055#if (SIZEOF_LONG == 8)
3056# define FAST_CHAR_MASK 0x8000800080008000L
3057# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3058#elif (SIZEOF_LONG == 4)
3059# define FAST_CHAR_MASK 0x80008000L
3060# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3061#else
3062# error C 'long' size should be either 4 or 8!
3063#endif
3064
Walter Dörwald69652032004-09-07 20:24:22 +00003065PyObject *
3066PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003067 Py_ssize_t size,
3068 const char *errors,
3069 int *byteorder,
3070 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003071{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003073 Py_ssize_t startinpos;
3074 Py_ssize_t endinpos;
3075 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 PyUnicodeObject *unicode;
3077 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003078 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003079 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003080 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003081 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003082 /* Offsets from q for retrieving byte pairs in the right order. */
3083#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3084 int ihi = 1, ilo = 0;
3085#else
3086 int ihi = 0, ilo = 1;
3087#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 PyObject *errorHandler = NULL;
3089 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090
3091 /* Note: size will always be longer than the resulting Unicode
3092 character count */
3093 unicode = _PyUnicode_New(size);
3094 if (!unicode)
3095 return NULL;
3096 if (size == 0)
3097 return (PyObject *)unicode;
3098
3099 /* Unpack UTF-16 encoded data */
3100 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003101 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003102 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103
3104 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003105 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003107 /* Check for BOM marks (U+FEFF) in the input and adjust current
3108 byte order setting accordingly. In native mode, the leading BOM
3109 mark is skipped, in all other modes, it is copied to the output
3110 stream as-is (giving a ZWNBSP character). */
3111 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003112 if (size >= 2) {
3113 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003114#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003115 if (bom == 0xFEFF) {
3116 q += 2;
3117 bo = -1;
3118 }
3119 else if (bom == 0xFFFE) {
3120 q += 2;
3121 bo = 1;
3122 }
Tim Petersced69f82003-09-16 20:30:58 +00003123#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003124 if (bom == 0xFEFF) {
3125 q += 2;
3126 bo = 1;
3127 }
3128 else if (bom == 0xFFFE) {
3129 q += 2;
3130 bo = -1;
3131 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003132#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003133 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135
Tim Peters772747b2001-08-09 22:21:55 +00003136 if (bo == -1) {
3137 /* force LE */
3138 ihi = 1;
3139 ilo = 0;
3140 }
3141 else if (bo == 1) {
3142 /* force BE */
3143 ihi = 0;
3144 ilo = 1;
3145 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003146#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3147 native_ordering = ilo < ihi;
3148#else
3149 native_ordering = ilo > ihi;
3150#endif
Tim Peters772747b2001-08-09 22:21:55 +00003151
Antoine Pitrouab868312009-01-10 15:40:25 +00003152 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003153 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003154 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003155 /* First check for possible aligned read of a C 'long'. Unaligned
3156 reads are more expensive, better to defer to another iteration. */
3157 if (!((size_t) q & LONG_PTR_MASK)) {
3158 /* Fast path for runs of non-surrogate chars. */
3159 register const unsigned char *_q = q;
3160 Py_UNICODE *_p = p;
3161 if (native_ordering) {
3162 /* Native ordering is simple: as long as the input cannot
3163 possibly contain a surrogate char, do an unrolled copy
3164 of several 16-bit code points to the target object.
3165 The non-surrogate check is done on several input bytes
3166 at a time (as many as a C 'long' can contain). */
3167 while (_q < aligned_end) {
3168 unsigned long data = * (unsigned long *) _q;
3169 if (data & FAST_CHAR_MASK)
3170 break;
3171 _p[0] = ((unsigned short *) _q)[0];
3172 _p[1] = ((unsigned short *) _q)[1];
3173#if (SIZEOF_LONG == 8)
3174 _p[2] = ((unsigned short *) _q)[2];
3175 _p[3] = ((unsigned short *) _q)[3];
3176#endif
3177 _q += SIZEOF_LONG;
3178 _p += SIZEOF_LONG / 2;
3179 }
3180 }
3181 else {
3182 /* Byteswapped ordering is similar, but we must decompose
3183 the copy bytewise, and take care of zero'ing out the
3184 upper bytes if the target object is in 32-bit units
3185 (that is, in UCS-4 builds). */
3186 while (_q < aligned_end) {
3187 unsigned long data = * (unsigned long *) _q;
3188 if (data & SWAPPED_FAST_CHAR_MASK)
3189 break;
3190 /* Zero upper bytes in UCS-4 builds */
3191#if (Py_UNICODE_SIZE > 2)
3192 _p[0] = 0;
3193 _p[1] = 0;
3194#if (SIZEOF_LONG == 8)
3195 _p[2] = 0;
3196 _p[3] = 0;
3197#endif
3198#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003199 /* Issue #4916; UCS-4 builds on big endian machines must
3200 fill the two last bytes of each 4-byte unit. */
3201#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3202# define OFF 2
3203#else
3204# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003205#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003206 ((unsigned char *) _p)[OFF + 1] = _q[0];
3207 ((unsigned char *) _p)[OFF + 0] = _q[1];
3208 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3209 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3210#if (SIZEOF_LONG == 8)
3211 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3212 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3213 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3214 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3215#endif
3216#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003217 _q += SIZEOF_LONG;
3218 _p += SIZEOF_LONG / 2;
3219 }
3220 }
3221 p = _p;
3222 q = _q;
3223 if (q >= e)
3224 break;
3225 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227
Benjamin Peterson14339b62009-01-31 16:36:08 +00003228 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003229
3230 if (ch < 0xD800 || ch > 0xDFFF) {
3231 *p++ = ch;
3232 continue;
3233 }
3234
3235 /* UTF-16 code pair: */
3236 if (q > e) {
3237 errmsg = "unexpected end of data";
3238 startinpos = (((const char *)q) - 2) - starts;
3239 endinpos = ((const char *)e) + 1 - starts;
3240 goto utf16Error;
3241 }
3242 if (0xD800 <= ch && ch <= 0xDBFF) {
3243 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3244 q += 2;
3245 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003246#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 *p++ = ch;
3248 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003249#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003251#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003252 continue;
3253 }
3254 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003255 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 startinpos = (((const char *)q)-4)-starts;
3257 endinpos = startinpos+2;
3258 goto utf16Error;
3259 }
3260
Benjamin Peterson14339b62009-01-31 16:36:08 +00003261 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 errmsg = "illegal encoding";
3263 startinpos = (((const char *)q)-2)-starts;
3264 endinpos = startinpos+2;
3265 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003266
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 utf16Error:
3268 outpos = p - PyUnicode_AS_UNICODE(unicode);
3269 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003270 errors,
3271 &errorHandler,
3272 "utf16", errmsg,
3273 &starts,
3274 (const char **)&e,
3275 &startinpos,
3276 &endinpos,
3277 &exc,
3278 (const char **)&q,
3279 &unicode,
3280 &outpos,
3281 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003282 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003284 /* remaining byte at the end? (size should be even) */
3285 if (e == q) {
3286 if (!consumed) {
3287 errmsg = "truncated data";
3288 startinpos = ((const char *)q) - starts;
3289 endinpos = ((const char *)e) + 1 - starts;
3290 outpos = p - PyUnicode_AS_UNICODE(unicode);
3291 if (unicode_decode_call_errorhandler(
3292 errors,
3293 &errorHandler,
3294 "utf16", errmsg,
3295 &starts,
3296 (const char **)&e,
3297 &startinpos,
3298 &endinpos,
3299 &exc,
3300 (const char **)&q,
3301 &unicode,
3302 &outpos,
3303 &p))
3304 goto onError;
3305 /* The remaining input chars are ignored if the callback
3306 chooses to skip the input */
3307 }
3308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
3310 if (byteorder)
3311 *byteorder = bo;
3312
Walter Dörwald69652032004-09-07 20:24:22 +00003313 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003315
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003317 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 goto onError;
3319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 Py_XDECREF(errorHandler);
3321 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322 return (PyObject *)unicode;
3323
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 Py_XDECREF(errorHandler);
3327 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328 return NULL;
3329}
3330
Antoine Pitrouab868312009-01-10 15:40:25 +00003331#undef FAST_CHAR_MASK
3332#undef SWAPPED_FAST_CHAR_MASK
3333
Tim Peters772747b2001-08-09 22:21:55 +00003334PyObject *
3335PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 Py_ssize_t size,
3337 const char *errors,
3338 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003340 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003341 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003342 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003343#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003344 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003345#else
3346 const int pairs = 0;
3347#endif
Tim Peters772747b2001-08-09 22:21:55 +00003348 /* Offsets from p for storing byte pairs in the right order. */
3349#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3350 int ihi = 1, ilo = 0;
3351#else
3352 int ihi = 0, ilo = 1;
3353#endif
3354
Benjamin Peterson29060642009-01-31 22:14:21 +00003355#define STORECHAR(CH) \
3356 do { \
3357 p[ihi] = ((CH) >> 8) & 0xff; \
3358 p[ilo] = (CH) & 0xff; \
3359 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003360 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003362#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003363 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003364 if (s[i] >= 0x10000)
3365 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003366#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003367 /* 2 * (size + pairs + (byteorder == 0)) */
3368 if (size > PY_SSIZE_T_MAX ||
3369 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003371 nsize = size + pairs + (byteorder == 0);
3372 bytesize = nsize * 2;
3373 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003374 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003375 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 if (v == NULL)
3377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003379 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003381 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003382 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003383 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003384
3385 if (byteorder == -1) {
3386 /* force LE */
3387 ihi = 1;
3388 ilo = 0;
3389 }
3390 else if (byteorder == 1) {
3391 /* force BE */
3392 ihi = 0;
3393 ilo = 1;
3394 }
3395
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003396 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003397 Py_UNICODE ch = *s++;
3398 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003399#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003400 if (ch >= 0x10000) {
3401 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3402 ch = 0xD800 | ((ch-0x10000) >> 10);
3403 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003404#endif
Tim Peters772747b2001-08-09 22:21:55 +00003405 STORECHAR(ch);
3406 if (ch2)
3407 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003408 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003409
3410 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003411 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003412#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003413}
3414
3415PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3416{
3417 if (!PyUnicode_Check(unicode)) {
3418 PyErr_BadArgument();
3419 return NULL;
3420 }
3421 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003422 PyUnicode_GET_SIZE(unicode),
3423 NULL,
3424 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425}
3426
3427/* --- Unicode Escape Codec ----------------------------------------------- */
3428
Fredrik Lundh06d12682001-01-24 07:59:11 +00003429static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003430
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 Py_ssize_t size,
3433 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003436 Py_ssize_t startinpos;
3437 Py_ssize_t endinpos;
3438 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003441 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003443 char* message;
3444 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 PyObject *errorHandler = NULL;
3446 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003447
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 /* Escaped strings will always be longer than the resulting
3449 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 length after conversion to the true value.
3451 (but if the error callback returns a long replacement string
3452 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 v = _PyUnicode_New(size);
3454 if (v == NULL)
3455 goto onError;
3456 if (size == 0)
3457 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003461
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 while (s < end) {
3463 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003464 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466
3467 /* Non-escape characters are interpreted as Unicode ordinals */
3468 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003469 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 continue;
3471 }
3472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 /* \ - Escapes */
3475 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003476 c = *s++;
3477 if (s > end)
3478 c = '\0'; /* Invalid after \ */
3479 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 case '\n': break;
3483 case '\\': *p++ = '\\'; break;
3484 case '\'': *p++ = '\''; break;
3485 case '\"': *p++ = '\"'; break;
3486 case 'b': *p++ = '\b'; break;
3487 case 'f': *p++ = '\014'; break; /* FF */
3488 case 't': *p++ = '\t'; break;
3489 case 'n': *p++ = '\n'; break;
3490 case 'r': *p++ = '\r'; break;
3491 case 'v': *p++ = '\013'; break; /* VT */
3492 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3493
Benjamin Peterson29060642009-01-31 22:14:21 +00003494 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 case '0': case '1': case '2': case '3':
3496 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003497 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003498 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003499 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003500 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003501 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003503 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 break;
3505
Benjamin Peterson29060642009-01-31 22:14:21 +00003506 /* hex escapes */
3507 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003509 digits = 2;
3510 message = "truncated \\xXX escape";
3511 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512
Benjamin Peterson29060642009-01-31 22:14:21 +00003513 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003515 digits = 4;
3516 message = "truncated \\uXXXX escape";
3517 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518
Benjamin Peterson29060642009-01-31 22:14:21 +00003519 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003520 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003521 digits = 8;
3522 message = "truncated \\UXXXXXXXX escape";
3523 hexescape:
3524 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 outpos = p-PyUnicode_AS_UNICODE(v);
3526 if (s+digits>end) {
3527 endinpos = size;
3528 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003529 errors, &errorHandler,
3530 "unicodeescape", "end of string in escape sequence",
3531 &starts, &end, &startinpos, &endinpos, &exc, &s,
3532 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 goto onError;
3534 goto nextByte;
3535 }
3536 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003537 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003538 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 endinpos = (s+i+1)-starts;
3540 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003541 errors, &errorHandler,
3542 "unicodeescape", message,
3543 &starts, &end, &startinpos, &endinpos, &exc, &s,
3544 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003545 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003547 }
3548 chr = (chr<<4) & ~0xF;
3549 if (c >= '0' && c <= '9')
3550 chr += c - '0';
3551 else if (c >= 'a' && c <= 'f')
3552 chr += 10 + c - 'a';
3553 else
3554 chr += 10 + c - 'A';
3555 }
3556 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003557 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 /* _decoding_error will have already written into the
3559 target buffer. */
3560 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003561 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003562 /* when we get here, chr is a 32-bit unicode character */
3563 if (chr <= 0xffff)
3564 /* UCS-2 character */
3565 *p++ = (Py_UNICODE) chr;
3566 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003567 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003568 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003569#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003570 *p++ = chr;
3571#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003572 chr -= 0x10000L;
3573 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003574 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003575#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003576 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 endinpos = s-starts;
3578 outpos = p-PyUnicode_AS_UNICODE(v);
3579 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003580 errors, &errorHandler,
3581 "unicodeescape", "illegal Unicode character",
3582 &starts, &end, &startinpos, &endinpos, &exc, &s,
3583 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003584 goto onError;
3585 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003586 break;
3587
Benjamin Peterson29060642009-01-31 22:14:21 +00003588 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003589 case 'N':
3590 message = "malformed \\N character escape";
3591 if (ucnhash_CAPI == NULL) {
3592 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003593 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003594 if (ucnhash_CAPI == NULL)
3595 goto ucnhashError;
3596 }
3597 if (*s == '{') {
3598 const char *start = s+1;
3599 /* look for the closing brace */
3600 while (*s != '}' && s < end)
3601 s++;
3602 if (s > start && s < end && *s == '}') {
3603 /* found a name. look it up in the unicode database */
3604 message = "unknown Unicode character name";
3605 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003606 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003607 goto store;
3608 }
3609 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 endinpos = s-starts;
3611 outpos = p-PyUnicode_AS_UNICODE(v);
3612 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003613 errors, &errorHandler,
3614 "unicodeescape", message,
3615 &starts, &end, &startinpos, &endinpos, &exc, &s,
3616 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003617 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003618 break;
3619
3620 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003621 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 message = "\\ at end of string";
3623 s--;
3624 endinpos = s-starts;
3625 outpos = p-PyUnicode_AS_UNICODE(v);
3626 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 errors, &errorHandler,
3628 "unicodeescape", message,
3629 &starts, &end, &startinpos, &endinpos, &exc, &s,
3630 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003631 goto onError;
3632 }
3633 else {
3634 *p++ = '\\';
3635 *p++ = (unsigned char)s[-1];
3636 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003637 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003642 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003644 Py_XDECREF(errorHandler);
3645 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003647
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003649 PyErr_SetString(
3650 PyExc_UnicodeError,
3651 "\\N escapes not supported (can't load unicodedata module)"
3652 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003653 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 Py_XDECREF(errorHandler);
3655 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003656 return NULL;
3657
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 Py_XDECREF(errorHandler);
3661 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 return NULL;
3663}
3664
3665/* Return a Unicode-Escape string version of the Unicode object.
3666
3667 If quotes is true, the string is enclosed in u"" or u'' quotes as
3668 appropriate.
3669
3670*/
3671
Thomas Wouters477c8d52006-05-27 19:21:47 +00003672Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 Py_ssize_t size,
3674 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003675{
3676 /* like wcschr, but doesn't stop at NULL characters */
3677
3678 while (size-- > 0) {
3679 if (*s == ch)
3680 return s;
3681 s++;
3682 }
3683
3684 return NULL;
3685}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003686
/* Lower-case hexadecimal digits, indexed by nibble value (0..15).  Both the
   characters and the pointer are const so the table cannot be re-pointed. */
static const char *const hexdigits = "0123456789abcdef";
3688
3689PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003690 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003692 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003695#ifdef Py_UNICODE_WIDE
3696 const Py_ssize_t expandsize = 10;
3697#else
3698 const Py_ssize_t expandsize = 6;
3699#endif
3700
Thomas Wouters89f507f2006-12-13 04:49:30 +00003701 /* XXX(nnorwitz): rather than over-allocating, it would be
3702 better to choose a different scheme. Perhaps scan the
3703 first N-chars of the string and allocate based on that size.
3704 */
3705 /* Initial allocation is based on the longest-possible unichr
3706 escape.
3707
3708 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3709 unichr, so in this case it's the longest unichr escape. In
3710 narrow (UTF-16) builds this is five chars per source unichr
3711 since there are two unichrs in the surrogate pair, so in narrow
3712 (UTF-16) builds it's not the longest unichr escape.
3713
3714 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3715 so in the narrow (UTF-16) build case it's the longest unichr
3716 escape.
3717 */
3718
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003719 if (size == 0)
3720 return PyBytes_FromStringAndSize(NULL, 0);
3721
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003722 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003724
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003725 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003726 2
3727 + expandsize*size
3728 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 if (repr == NULL)
3730 return NULL;
3731
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003732 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 while (size-- > 0) {
3735 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003736
Walter Dörwald79e913e2007-05-12 11:08:06 +00003737 /* Escape backslashes */
3738 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739 *p++ = '\\';
3740 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003741 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003742 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003743
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003744#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003745 /* Map 21-bit characters to '\U00xxxxxx' */
3746 else if (ch >= 0x10000) {
3747 *p++ = '\\';
3748 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003749 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3750 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3751 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3752 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3753 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3754 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3755 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3756 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003758 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003759#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003760 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3761 else if (ch >= 0xD800 && ch < 0xDC00) {
3762 Py_UNICODE ch2;
3763 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003764
Benjamin Peterson29060642009-01-31 22:14:21 +00003765 ch2 = *s++;
3766 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003767 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003768 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3769 *p++ = '\\';
3770 *p++ = 'U';
3771 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3772 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3773 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3774 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3775 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3776 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3777 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3778 *p++ = hexdigits[ucs & 0x0000000F];
3779 continue;
3780 }
3781 /* Fall through: isolated surrogates are copied as-is */
3782 s--;
3783 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003784 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003785#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003786
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003788 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 *p++ = '\\';
3790 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003791 *p++ = hexdigits[(ch >> 12) & 0x000F];
3792 *p++ = hexdigits[(ch >> 8) & 0x000F];
3793 *p++ = hexdigits[(ch >> 4) & 0x000F];
3794 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003796
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003797 /* Map special whitespace to '\t', \n', '\r' */
3798 else if (ch == '\t') {
3799 *p++ = '\\';
3800 *p++ = 't';
3801 }
3802 else if (ch == '\n') {
3803 *p++ = '\\';
3804 *p++ = 'n';
3805 }
3806 else if (ch == '\r') {
3807 *p++ = '\\';
3808 *p++ = 'r';
3809 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003810
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003811 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003812 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003814 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003815 *p++ = hexdigits[(ch >> 4) & 0x000F];
3816 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003817 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003818
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 /* Copy everything else as-is */
3820 else
3821 *p++ = (char) ch;
3822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003824 assert(p - PyBytes_AS_STRING(repr) > 0);
3825 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3826 return NULL;
3827 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828}
3829
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003830PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003832 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833 if (!PyUnicode_Check(unicode)) {
3834 PyErr_BadArgument();
3835 return NULL;
3836 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003837 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3838 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003839 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840}
3841
3842/* --- Raw Unicode Escape Codec ------------------------------------------- */
3843
3844PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003845 Py_ssize_t size,
3846 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003848 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003849 Py_ssize_t startinpos;
3850 Py_ssize_t endinpos;
3851 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003853 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 const char *end;
3855 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 PyObject *errorHandler = NULL;
3857 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003858
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 /* Escaped strings will always be longer than the resulting
3860 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 length after conversion to the true value. (But decoding error
3862 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 v = _PyUnicode_New(size);
3864 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003867 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003868 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 end = s + size;
3870 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003871 unsigned char c;
3872 Py_UCS4 x;
3873 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003874 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875
Benjamin Peterson29060642009-01-31 22:14:21 +00003876 /* Non-escape characters are interpreted as Unicode ordinals */
3877 if (*s != '\\') {
3878 *p++ = (unsigned char)*s++;
3879 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003880 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003881 startinpos = s-starts;
3882
3883 /* \u-escapes are only interpreted iff the number of leading
3884 backslashes if odd */
3885 bs = s;
3886 for (;s < end;) {
3887 if (*s != '\\')
3888 break;
3889 *p++ = (unsigned char)*s++;
3890 }
3891 if (((s - bs) & 1) == 0 ||
3892 s >= end ||
3893 (*s != 'u' && *s != 'U')) {
3894 continue;
3895 }
3896 p--;
3897 count = *s=='u' ? 4 : 8;
3898 s++;
3899
3900 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3901 outpos = p-PyUnicode_AS_UNICODE(v);
3902 for (x = 0, i = 0; i < count; ++i, ++s) {
3903 c = (unsigned char)*s;
3904 if (!ISXDIGIT(c)) {
3905 endinpos = s-starts;
3906 if (unicode_decode_call_errorhandler(
3907 errors, &errorHandler,
3908 "rawunicodeescape", "truncated \\uXXXX",
3909 &starts, &end, &startinpos, &endinpos, &exc, &s,
3910 &v, &outpos, &p))
3911 goto onError;
3912 goto nextByte;
3913 }
3914 x = (x<<4) & ~0xF;
3915 if (c >= '0' && c <= '9')
3916 x += c - '0';
3917 else if (c >= 'a' && c <= 'f')
3918 x += 10 + c - 'a';
3919 else
3920 x += 10 + c - 'A';
3921 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003922 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003923 /* UCS-2 character */
3924 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003925 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003926 /* UCS-4 character. Either store directly, or as
3927 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003928#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003930#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003931 x -= 0x10000L;
3932 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3933 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003934#endif
3935 } else {
3936 endinpos = s-starts;
3937 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003938 if (unicode_decode_call_errorhandler(
3939 errors, &errorHandler,
3940 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003941 &starts, &end, &startinpos, &endinpos, &exc, &s,
3942 &v, &outpos, &p))
3943 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003944 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003945 nextByte:
3946 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003948 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003949 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003950 Py_XDECREF(errorHandler);
3951 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003953
Benjamin Peterson29060642009-01-31 22:14:21 +00003954 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 Py_XDECREF(errorHandler);
3957 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 return NULL;
3959}
3960
3961PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003962 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003964 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 char *p;
3966 char *q;
3967
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003968#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003969 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003970#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003971 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003972#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003973
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003974 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003976
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003977 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978 if (repr == NULL)
3979 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003980 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003981 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003983 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 while (size-- > 0) {
3985 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003986#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003987 /* Map 32-bit characters to '\Uxxxxxxxx' */
3988 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003989 *p++ = '\\';
3990 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003991 *p++ = hexdigits[(ch >> 28) & 0xf];
3992 *p++ = hexdigits[(ch >> 24) & 0xf];
3993 *p++ = hexdigits[(ch >> 20) & 0xf];
3994 *p++ = hexdigits[(ch >> 16) & 0xf];
3995 *p++ = hexdigits[(ch >> 12) & 0xf];
3996 *p++ = hexdigits[(ch >> 8) & 0xf];
3997 *p++ = hexdigits[(ch >> 4) & 0xf];
3998 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003999 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004000 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004001#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4003 if (ch >= 0xD800 && ch < 0xDC00) {
4004 Py_UNICODE ch2;
4005 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004006
Benjamin Peterson29060642009-01-31 22:14:21 +00004007 ch2 = *s++;
4008 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004009 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004010 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4011 *p++ = '\\';
4012 *p++ = 'U';
4013 *p++ = hexdigits[(ucs >> 28) & 0xf];
4014 *p++ = hexdigits[(ucs >> 24) & 0xf];
4015 *p++ = hexdigits[(ucs >> 20) & 0xf];
4016 *p++ = hexdigits[(ucs >> 16) & 0xf];
4017 *p++ = hexdigits[(ucs >> 12) & 0xf];
4018 *p++ = hexdigits[(ucs >> 8) & 0xf];
4019 *p++ = hexdigits[(ucs >> 4) & 0xf];
4020 *p++ = hexdigits[ucs & 0xf];
4021 continue;
4022 }
4023 /* Fall through: isolated surrogates are copied as-is */
4024 s--;
4025 size++;
4026 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004027#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 /* Map 16-bit characters to '\uxxxx' */
4029 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 *p++ = '\\';
4031 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004032 *p++ = hexdigits[(ch >> 12) & 0xf];
4033 *p++ = hexdigits[(ch >> 8) & 0xf];
4034 *p++ = hexdigits[(ch >> 4) & 0xf];
4035 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 /* Copy everything else as-is */
4038 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 *p++ = (char) ch;
4040 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004041 size = p - q;
4042
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004043 assert(size > 0);
4044 if (_PyBytes_Resize(&repr, size) < 0)
4045 return NULL;
4046 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047}
4048
4049PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4050{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004051 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004053 PyErr_BadArgument();
4054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004056 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4057 PyUnicode_GET_SIZE(unicode));
4058
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004059 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060}
4061
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004062/* --- Unicode Internal Codec ------------------------------------------- */
4063
4064PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 Py_ssize_t size,
4066 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004067{
4068 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004069 Py_ssize_t startinpos;
4070 Py_ssize_t endinpos;
4071 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004072 PyUnicodeObject *v;
4073 Py_UNICODE *p;
4074 const char *end;
4075 const char *reason;
4076 PyObject *errorHandler = NULL;
4077 PyObject *exc = NULL;
4078
Neal Norwitzd43069c2006-01-08 01:12:10 +00004079#ifdef Py_UNICODE_WIDE
4080 Py_UNICODE unimax = PyUnicode_GetMax();
4081#endif
4082
Thomas Wouters89f507f2006-12-13 04:49:30 +00004083 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004084 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4085 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004086 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004087 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004089 p = PyUnicode_AS_UNICODE(v);
4090 end = s + size;
4091
4092 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004093 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004094 /* We have to sanity check the raw data, otherwise doom looms for
4095 some malformed UCS-4 data. */
4096 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004097#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004098 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004099#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004100 end-s < Py_UNICODE_SIZE
4101 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004102 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004103 startinpos = s - starts;
4104 if (end-s < Py_UNICODE_SIZE) {
4105 endinpos = end-starts;
4106 reason = "truncated input";
4107 }
4108 else {
4109 endinpos = s - starts + Py_UNICODE_SIZE;
4110 reason = "illegal code point (> 0x10FFFF)";
4111 }
4112 outpos = p - PyUnicode_AS_UNICODE(v);
4113 if (unicode_decode_call_errorhandler(
4114 errors, &errorHandler,
4115 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004116 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004117 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004118 goto onError;
4119 }
4120 }
4121 else {
4122 p++;
4123 s += Py_UNICODE_SIZE;
4124 }
4125 }
4126
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004127 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004128 goto onError;
4129 Py_XDECREF(errorHandler);
4130 Py_XDECREF(exc);
4131 return (PyObject *)v;
4132
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004134 Py_XDECREF(v);
4135 Py_XDECREF(errorHandler);
4136 Py_XDECREF(exc);
4137 return NULL;
4138}
4139
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140/* --- Latin-1 Codec ------------------------------------------------------ */
4141
4142PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004143 Py_ssize_t size,
4144 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145{
4146 PyUnicodeObject *v;
4147 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004148 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004149
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004151 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 Py_UNICODE r = *(unsigned char*)s;
4153 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004154 }
4155
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 v = _PyUnicode_New(size);
4157 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004160 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004162 e = s + size;
4163 /* Unrolling the copy makes it much faster by reducing the looping
4164 overhead. This is similar to what many memcpy() implementations do. */
4165 unrolled_end = e - 4;
4166 while (s < unrolled_end) {
4167 p[0] = (unsigned char) s[0];
4168 p[1] = (unsigned char) s[1];
4169 p[2] = (unsigned char) s[2];
4170 p[3] = (unsigned char) s[3];
4171 s += 4;
4172 p += 4;
4173 }
4174 while (s < e)
4175 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004177
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 Py_XDECREF(v);
4180 return NULL;
4181}
4182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183/* create or adjust a UnicodeEncodeError */
4184static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 const char *encoding,
4186 const Py_UNICODE *unicode, Py_ssize_t size,
4187 Py_ssize_t startpos, Py_ssize_t endpos,
4188 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 *exceptionObject = PyUnicodeEncodeError_Create(
4192 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193 }
4194 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4196 goto onError;
4197 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4198 goto onError;
4199 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4200 goto onError;
4201 return;
4202 onError:
4203 Py_DECREF(*exceptionObject);
4204 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 }
4206}
4207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208/* raises a UnicodeEncodeError */
4209static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 const char *encoding,
4211 const Py_UNICODE *unicode, Py_ssize_t size,
4212 Py_ssize_t startpos, Py_ssize_t endpos,
4213 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214{
4215 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004218 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219}
4220
4221/* error handling callback helper:
4222 build arguments, call the callback and check the arguments,
4223 put the result into newpos and return the replacement string, which
4224 has to be freed by the caller */
4225static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 PyObject **errorHandler,
4227 const char *encoding, const char *reason,
4228 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4229 Py_ssize_t startpos, Py_ssize_t endpos,
4230 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004232 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233
4234 PyObject *restuple;
4235 PyObject *resunicode;
4236
4237 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 }
4242
4243 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247
4248 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004249 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004253 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 Py_DECREF(restuple);
4255 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004257 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 &resunicode, newpos)) {
4259 Py_DECREF(restuple);
4260 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004262 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4263 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4264 Py_DECREF(restuple);
4265 return NULL;
4266 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004269 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004270 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4271 Py_DECREF(restuple);
4272 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004273 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 Py_INCREF(resunicode);
4275 Py_DECREF(restuple);
4276 return resunicode;
4277}
4278
4279static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 Py_ssize_t size,
4281 const char *errors,
4282 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283{
4284 /* output object */
4285 PyObject *res;
4286 /* pointers to the beginning and end+1 of input */
4287 const Py_UNICODE *startp = p;
4288 const Py_UNICODE *endp = p + size;
4289 /* pointer to the beginning of the unencodable characters */
4290 /* const Py_UNICODE *badp = NULL; */
4291 /* pointer into the output */
4292 char *str;
4293 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004294 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004295 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4296 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 PyObject *errorHandler = NULL;
4298 PyObject *exc = NULL;
4299 /* the following variable is used for caching string comparisons
4300 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4301 int known_errorHandler = -1;
4302
4303 /* allocate enough for a simple encoding without
4304 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004305 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004306 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004307 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004309 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004310 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004311 ressize = size;
4312
4313 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 /* can we encode this? */
4317 if (c<limit) {
4318 /* no overflow check, because we know that the space is enough */
4319 *str++ = (char)c;
4320 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004321 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004322 else {
4323 Py_ssize_t unicodepos = p-startp;
4324 Py_ssize_t requiredsize;
4325 PyObject *repunicode;
4326 Py_ssize_t repsize;
4327 Py_ssize_t newpos;
4328 Py_ssize_t respos;
4329 Py_UNICODE *uni2;
4330 /* startpos for collecting unencodable chars */
4331 const Py_UNICODE *collstart = p;
4332 const Py_UNICODE *collend = p;
4333 /* find all unecodable characters */
4334 while ((collend < endp) && ((*collend)>=limit))
4335 ++collend;
4336 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4337 if (known_errorHandler==-1) {
4338 if ((errors==NULL) || (!strcmp(errors, "strict")))
4339 known_errorHandler = 1;
4340 else if (!strcmp(errors, "replace"))
4341 known_errorHandler = 2;
4342 else if (!strcmp(errors, "ignore"))
4343 known_errorHandler = 3;
4344 else if (!strcmp(errors, "xmlcharrefreplace"))
4345 known_errorHandler = 4;
4346 else
4347 known_errorHandler = 0;
4348 }
4349 switch (known_errorHandler) {
4350 case 1: /* strict */
4351 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4352 goto onError;
4353 case 2: /* replace */
4354 while (collstart++<collend)
4355 *str++ = '?'; /* fall through */
4356 case 3: /* ignore */
4357 p = collend;
4358 break;
4359 case 4: /* xmlcharrefreplace */
4360 respos = str - PyBytes_AS_STRING(res);
4361 /* determine replacement size (temporarily (mis)uses p) */
4362 for (p = collstart, repsize = 0; p < collend; ++p) {
4363 if (*p<10)
4364 repsize += 2+1+1;
4365 else if (*p<100)
4366 repsize += 2+2+1;
4367 else if (*p<1000)
4368 repsize += 2+3+1;
4369 else if (*p<10000)
4370 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004371#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 else
4373 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004374#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 else if (*p<100000)
4376 repsize += 2+5+1;
4377 else if (*p<1000000)
4378 repsize += 2+6+1;
4379 else
4380 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004381#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 }
4383 requiredsize = respos+repsize+(endp-collend);
4384 if (requiredsize > ressize) {
4385 if (requiredsize<2*ressize)
4386 requiredsize = 2*ressize;
4387 if (_PyBytes_Resize(&res, requiredsize))
4388 goto onError;
4389 str = PyBytes_AS_STRING(res) + respos;
4390 ressize = requiredsize;
4391 }
4392 /* generate replacement (temporarily (mis)uses p) */
4393 for (p = collstart; p < collend; ++p) {
4394 str += sprintf(str, "&#%d;", (int)*p);
4395 }
4396 p = collend;
4397 break;
4398 default:
4399 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4400 encoding, reason, startp, size, &exc,
4401 collstart-startp, collend-startp, &newpos);
4402 if (repunicode == NULL)
4403 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004404 if (PyBytes_Check(repunicode)) {
4405 /* Directly copy bytes result to output. */
4406 repsize = PyBytes_Size(repunicode);
4407 if (repsize > 1) {
4408 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004409 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004410 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4411 Py_DECREF(repunicode);
4412 goto onError;
4413 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004414 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004415 ressize += repsize-1;
4416 }
4417 memcpy(str, PyBytes_AsString(repunicode), repsize);
4418 str += repsize;
4419 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004420 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004421 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004422 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 /* need more space? (at least enough for what we
4424 have+the replacement+the rest of the string, so
4425 we won't have to check space for encodable characters) */
4426 respos = str - PyBytes_AS_STRING(res);
4427 repsize = PyUnicode_GET_SIZE(repunicode);
4428 requiredsize = respos+repsize+(endp-collend);
4429 if (requiredsize > ressize) {
4430 if (requiredsize<2*ressize)
4431 requiredsize = 2*ressize;
4432 if (_PyBytes_Resize(&res, requiredsize)) {
4433 Py_DECREF(repunicode);
4434 goto onError;
4435 }
4436 str = PyBytes_AS_STRING(res) + respos;
4437 ressize = requiredsize;
4438 }
4439 /* check if there is anything unencodable in the replacement
4440 and copy it to the output */
4441 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4442 c = *uni2;
4443 if (c >= limit) {
4444 raise_encode_exception(&exc, encoding, startp, size,
4445 unicodepos, unicodepos+1, reason);
4446 Py_DECREF(repunicode);
4447 goto onError;
4448 }
4449 *str = (char)c;
4450 }
4451 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004452 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004454 }
4455 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004456 /* Resize if we allocated to much */
4457 size = str - PyBytes_AS_STRING(res);
4458 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004459 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004460 if (_PyBytes_Resize(&res, size) < 0)
4461 goto onError;
4462 }
4463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 Py_XDECREF(errorHandler);
4465 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004466 return res;
4467
4468 onError:
4469 Py_XDECREF(res);
4470 Py_XDECREF(errorHandler);
4471 Py_XDECREF(exc);
4472 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473}
4474
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 Py_ssize_t size,
4477 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480}
4481
4482PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4483{
4484 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 PyErr_BadArgument();
4486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487 }
4488 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 PyUnicode_GET_SIZE(unicode),
4490 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491}
4492
4493/* --- 7-bit ASCII Codec -------------------------------------------------- */
4494
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 Py_ssize_t size,
4497 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 PyUnicodeObject *v;
4501 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 Py_ssize_t startinpos;
4503 Py_ssize_t endinpos;
4504 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 const char *e;
4506 PyObject *errorHandler = NULL;
4507 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004508
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004510 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 Py_UNICODE r = *(unsigned char*)s;
4512 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004513 }
Tim Petersced69f82003-09-16 20:30:58 +00004514
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 v = _PyUnicode_New(size);
4516 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 e = s + size;
4522 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 register unsigned char c = (unsigned char)*s;
4524 if (c < 128) {
4525 *p++ = c;
4526 ++s;
4527 }
4528 else {
4529 startinpos = s-starts;
4530 endinpos = startinpos + 1;
4531 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4532 if (unicode_decode_call_errorhandler(
4533 errors, &errorHandler,
4534 "ascii", "ordinal not in range(128)",
4535 &starts, &e, &startinpos, &endinpos, &exc, &s,
4536 &v, &outpos, &p))
4537 goto onError;
4538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004540 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4542 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004543 Py_XDECREF(errorHandler);
4544 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004546
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 Py_XDECREF(errorHandler);
4550 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551 return NULL;
4552}
4553
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 Py_ssize_t size,
4556 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559}
4560
4561PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4562{
4563 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 PyErr_BadArgument();
4565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566 }
4567 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004568 PyUnicode_GET_SIZE(unicode),
4569 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570}
4571
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004572#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004573
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004574/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004575
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004576#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004577#define NEED_RETRY
4578#endif
4579
4580/* XXX This code is limited to "true" double-byte encodings, as
4581 a) it assumes an incomplete character consists of a single byte, and
4582 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004584
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   A byte for which IsDBCSLeadByte() is true only counts when one of the
   following holds for prev = CharPrev(s, curr):
     - prev == curr (there is no previous character),
     - the byte at prev is not itself a lead byte, or
     - prev is exactly two bytes before curr.
   NOTE(review): the intent appears to be to rule out a trail byte whose
   value happens to look like a lead byte -- confirm against the CharPrev
   documentation. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4595
4596/*
4597 * Decode MBCS string into unicode object. If 'final' is set, converts
4598 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4599 */
4600static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 const char *s, /* MBCS string */
4602 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004603 int final,
4604 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004605{
4606 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004607 Py_ssize_t n;
4608 DWORD usize;
4609 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004610
4611 assert(size >= 0);
4612
Victor Stinner554f3f02010-06-16 23:33:54 +00004613 /* check and handle 'errors' arg */
4614 if (errors==NULL || strcmp(errors, "strict")==0)
4615 flags = MB_ERR_INVALID_CHARS;
4616 else if (strcmp(errors, "ignore")==0)
4617 flags = 0;
4618 else {
4619 PyErr_Format(PyExc_ValueError,
4620 "mbcs encoding does not support errors='%s'",
4621 errors);
4622 return -1;
4623 }
4624
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004625 /* Skip trailing lead-byte unless 'final' is set */
4626 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004628
4629 /* First get the size of the result */
4630 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004631 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4632 if (usize==0)
4633 goto mbcs_decode_error;
4634 } else
4635 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004636
4637 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 /* Create unicode object */
4639 *v = _PyUnicode_New(usize);
4640 if (*v == NULL)
4641 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004642 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004643 }
4644 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 /* Extend unicode object */
4646 n = PyUnicode_GET_SIZE(*v);
4647 if (_PyUnicode_Resize(v, n + usize) < 0)
4648 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004649 }
4650
4651 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004652 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004653 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004654 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4655 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004657 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004658 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004659
4660mbcs_decode_error:
4661 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4662 we raise a UnicodeDecodeError - else it is a 'generic'
4663 windows error
4664 */
4665 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4666 /* Ideally, we should get reason from FormatMessage - this
4667 is the Windows 2000 English version of the message
4668 */
4669 PyObject *exc = NULL;
4670 const char *reason = "No mapping for the Unicode character exists "
4671 "in the target multi-byte code page.";
4672 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4673 if (exc != NULL) {
4674 PyCodec_StrictErrors(exc);
4675 Py_DECREF(exc);
4676 }
4677 } else {
4678 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4679 }
4680 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004681}
4682
4683PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 Py_ssize_t size,
4685 const char *errors,
4686 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004687{
4688 PyUnicodeObject *v = NULL;
4689 int done;
4690
4691 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004693
4694#ifdef NEED_RETRY
4695 retry:
4696 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004697 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004698 else
4699#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004700 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004701
4702 if (done < 0) {
4703 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004705 }
4706
4707 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004708 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004709
4710#ifdef NEED_RETRY
4711 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 s += done;
4713 size -= done;
4714 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004715 }
4716#endif
4717
4718 return (PyObject *)v;
4719}
4720
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004721PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 Py_ssize_t size,
4723 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004724{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004725 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4726}
4727
4728/*
4729 * Convert unicode into string object (MBCS).
4730 * Returns 0 if succeed, -1 otherwise.
4731 */
4732static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004733 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004734 int size, /* size of unicode */
4735 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004736{
Victor Stinner554f3f02010-06-16 23:33:54 +00004737 BOOL usedDefaultChar = FALSE;
4738 BOOL *pusedDefaultChar;
4739 int mbcssize;
4740 Py_ssize_t n;
4741 PyObject *exc = NULL;
4742 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004743
4744 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004745
Victor Stinner554f3f02010-06-16 23:33:54 +00004746 /* check and handle 'errors' arg */
4747 if (errors==NULL || strcmp(errors, "strict")==0) {
4748 flags = WC_NO_BEST_FIT_CHARS;
4749 pusedDefaultChar = &usedDefaultChar;
4750 } else if (strcmp(errors, "replace")==0) {
4751 flags = 0;
4752 pusedDefaultChar = NULL;
4753 } else {
4754 PyErr_Format(PyExc_ValueError,
4755 "mbcs encoding does not support errors='%s'",
4756 errors);
4757 return -1;
4758 }
4759
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004760 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004761 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004762 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4763 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 if (mbcssize == 0) {
4765 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4766 return -1;
4767 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004768 /* If we used a default char, then we failed! */
4769 if (pusedDefaultChar && *pusedDefaultChar)
4770 goto mbcs_encode_error;
4771 } else {
4772 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004773 }
4774
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004775 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 /* Create string object */
4777 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4778 if (*repr == NULL)
4779 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004780 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004781 }
4782 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004783 /* Extend string object */
4784 n = PyBytes_Size(*repr);
4785 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4786 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004787 }
4788
4789 /* Do the conversion */
4790 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004792 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4793 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4795 return -1;
4796 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004797 if (pusedDefaultChar && *pusedDefaultChar)
4798 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004799 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004800 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004801
4802mbcs_encode_error:
4803 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4804 Py_XDECREF(exc);
4805 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004806}
4807
4808PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 Py_ssize_t size,
4810 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004811{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004812 PyObject *repr = NULL;
4813 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004814
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004815#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004817 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004818 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004819 else
4820#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004821 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004822
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004823 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004824 Py_XDECREF(repr);
4825 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004826 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004827
4828#ifdef NEED_RETRY
4829 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004830 p += INT_MAX;
4831 size -= INT_MAX;
4832 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004833 }
4834#endif
4835
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004836 return repr;
4837}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004838
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004839PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4840{
4841 if (!PyUnicode_Check(unicode)) {
4842 PyErr_BadArgument();
4843 return NULL;
4844 }
4845 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 PyUnicode_GET_SIZE(unicode),
4847 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004848}
4849
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004850#undef NEED_RETRY
4851
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004852#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004853
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854/* --- Character Mapping Codec -------------------------------------------- */
4855
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 Py_ssize_t size,
4858 PyObject *mapping,
4859 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004862 Py_ssize_t startinpos;
4863 Py_ssize_t endinpos;
4864 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 PyUnicodeObject *v;
4867 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004868 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 PyObject *errorHandler = NULL;
4870 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004871 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004872 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004873
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 /* Default to Latin-1 */
4875 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877
4878 v = _PyUnicode_New(size);
4879 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004884 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004885 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004886 mapstring = PyUnicode_AS_UNICODE(mapping);
4887 maplen = PyUnicode_GET_SIZE(mapping);
4888 while (s < e) {
4889 unsigned char ch = *s;
4890 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 if (ch < maplen)
4893 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894
Benjamin Peterson29060642009-01-31 22:14:21 +00004895 if (x == 0xfffe) {
4896 /* undefined mapping */
4897 outpos = p-PyUnicode_AS_UNICODE(v);
4898 startinpos = s-starts;
4899 endinpos = startinpos+1;
4900 if (unicode_decode_call_errorhandler(
4901 errors, &errorHandler,
4902 "charmap", "character maps to <undefined>",
4903 &starts, &e, &startinpos, &endinpos, &exc, &s,
4904 &v, &outpos, &p)) {
4905 goto onError;
4906 }
4907 continue;
4908 }
4909 *p++ = x;
4910 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004911 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004912 }
4913 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 while (s < e) {
4915 unsigned char ch = *s;
4916 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004917
Benjamin Peterson29060642009-01-31 22:14:21 +00004918 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4919 w = PyLong_FromLong((long)ch);
4920 if (w == NULL)
4921 goto onError;
4922 x = PyObject_GetItem(mapping, w);
4923 Py_DECREF(w);
4924 if (x == NULL) {
4925 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4926 /* No mapping found means: mapping is undefined. */
4927 PyErr_Clear();
4928 x = Py_None;
4929 Py_INCREF(x);
4930 } else
4931 goto onError;
4932 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004933
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 /* Apply mapping */
4935 if (PyLong_Check(x)) {
4936 long value = PyLong_AS_LONG(x);
4937 if (value < 0 || value > 65535) {
4938 PyErr_SetString(PyExc_TypeError,
4939 "character mapping must be in range(65536)");
4940 Py_DECREF(x);
4941 goto onError;
4942 }
4943 *p++ = (Py_UNICODE)value;
4944 }
4945 else if (x == Py_None) {
4946 /* undefined mapping */
4947 outpos = p-PyUnicode_AS_UNICODE(v);
4948 startinpos = s-starts;
4949 endinpos = startinpos+1;
4950 if (unicode_decode_call_errorhandler(
4951 errors, &errorHandler,
4952 "charmap", "character maps to <undefined>",
4953 &starts, &e, &startinpos, &endinpos, &exc, &s,
4954 &v, &outpos, &p)) {
4955 Py_DECREF(x);
4956 goto onError;
4957 }
4958 Py_DECREF(x);
4959 continue;
4960 }
4961 else if (PyUnicode_Check(x)) {
4962 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004963
Benjamin Peterson29060642009-01-31 22:14:21 +00004964 if (targetsize == 1)
4965 /* 1-1 mapping */
4966 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004967
Benjamin Peterson29060642009-01-31 22:14:21 +00004968 else if (targetsize > 1) {
4969 /* 1-n mapping */
4970 if (targetsize > extrachars) {
4971 /* resize first */
4972 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4973 Py_ssize_t needed = (targetsize - extrachars) + \
4974 (targetsize << 2);
4975 extrachars += needed;
4976 /* XXX overflow detection missing */
4977 if (_PyUnicode_Resize(&v,
4978 PyUnicode_GET_SIZE(v) + needed) < 0) {
4979 Py_DECREF(x);
4980 goto onError;
4981 }
4982 p = PyUnicode_AS_UNICODE(v) + oldpos;
4983 }
4984 Py_UNICODE_COPY(p,
4985 PyUnicode_AS_UNICODE(x),
4986 targetsize);
4987 p += targetsize;
4988 extrachars -= targetsize;
4989 }
4990 /* 1-0 mapping: skip the character */
4991 }
4992 else {
4993 /* wrong return value */
4994 PyErr_SetString(PyExc_TypeError,
4995 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004996 Py_DECREF(x);
4997 goto onError;
4998 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 Py_DECREF(x);
5000 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 }
5003 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5005 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 Py_XDECREF(errorHandler);
5007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005009
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 Py_XDECREF(errorHandler);
5012 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 Py_XDECREF(v);
5014 return NULL;
5015}
5016
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005017/* Charmap encoding: the lookup table */
5018
/* Three-level lookup table used for charmap encoding.

   The object is allocated with extra room after the struct so that level23
   holds 16*count2 level-2 bytes immediately followed by 128*count3 level-3
   bytes; the declared length of 1 is only a placeholder. */
struct encoding_map{
    PyObject_HEAD
    unsigned char level1[32];   /* indexed by c >> 11; 0xFF means no entry */
    int count2, count3;         /* number of level-2 / level-3 blocks */
    unsigned char level23[1];   /* level-2 blocks, then level-3 blocks */
};
5025
5026static PyObject*
5027encoding_map_size(PyObject *obj, PyObject* args)
5028{
5029 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005030 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005032}
5033
/* Method table for EncodingMap objects: a single no-argument method,
   "size", followed by the zeroed sentinel entry that ends the table. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
5039
5040static void
5041encoding_map_dealloc(PyObject* o)
5042{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005043 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005044}
5045
/* Type object for the encoding map objects created by
   PyUnicode_BuildEncodingMap().  Only tp_dealloc, tp_flags and tp_methods
   are filled in; every other slot is left at 0.  The initializer is
   positional, so the order of the entries must not change. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
5089
5090PyObject*
5091PyUnicode_BuildEncodingMap(PyObject* string)
5092{
5093 Py_UNICODE *decode;
5094 PyObject *result;
5095 struct encoding_map *mresult;
5096 int i;
5097 int need_dict = 0;
5098 unsigned char level1[32];
5099 unsigned char level2[512];
5100 unsigned char *mlevel1, *mlevel2, *mlevel3;
5101 int count2 = 0, count3 = 0;
5102
5103 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5104 PyErr_BadArgument();
5105 return NULL;
5106 }
5107 decode = PyUnicode_AS_UNICODE(string);
5108 memset(level1, 0xFF, sizeof level1);
5109 memset(level2, 0xFF, sizeof level2);
5110
5111 /* If there isn't a one-to-one mapping of NULL to \0,
5112 or if there are non-BMP characters, we need to use
5113 a mapping dictionary. */
5114 if (decode[0] != 0)
5115 need_dict = 1;
5116 for (i = 1; i < 256; i++) {
5117 int l1, l2;
5118 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005119#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005120 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005121#endif
5122 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005123 need_dict = 1;
5124 break;
5125 }
5126 if (decode[i] == 0xFFFE)
5127 /* unmapped character */
5128 continue;
5129 l1 = decode[i] >> 11;
5130 l2 = decode[i] >> 7;
5131 if (level1[l1] == 0xFF)
5132 level1[l1] = count2++;
5133 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005134 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005135 }
5136
5137 if (count2 >= 0xFF || count3 >= 0xFF)
5138 need_dict = 1;
5139
5140 if (need_dict) {
5141 PyObject *result = PyDict_New();
5142 PyObject *key, *value;
5143 if (!result)
5144 return NULL;
5145 for (i = 0; i < 256; i++) {
5146 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005147 key = PyLong_FromLong(decode[i]);
5148 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005149 if (!key || !value)
5150 goto failed1;
5151 if (PyDict_SetItem(result, key, value) == -1)
5152 goto failed1;
5153 Py_DECREF(key);
5154 Py_DECREF(value);
5155 }
5156 return result;
5157 failed1:
5158 Py_XDECREF(key);
5159 Py_XDECREF(value);
5160 Py_DECREF(result);
5161 return NULL;
5162 }
5163
5164 /* Create a three-level trie */
5165 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5166 16*count2 + 128*count3 - 1);
5167 if (!result)
5168 return PyErr_NoMemory();
5169 PyObject_Init(result, &EncodingMapType);
5170 mresult = (struct encoding_map*)result;
5171 mresult->count2 = count2;
5172 mresult->count3 = count3;
5173 mlevel1 = mresult->level1;
5174 mlevel2 = mresult->level23;
5175 mlevel3 = mresult->level23 + 16*count2;
5176 memcpy(mlevel1, level1, 32);
5177 memset(mlevel2, 0xFF, 16*count2);
5178 memset(mlevel3, 0, 128*count3);
5179 count3 = 0;
5180 for (i = 1; i < 256; i++) {
5181 int o1, o2, o3, i2, i3;
5182 if (decode[i] == 0xFFFE)
5183 /* unmapped character */
5184 continue;
5185 o1 = decode[i]>>11;
5186 o2 = (decode[i]>>7) & 0xF;
5187 i2 = 16*mlevel1[o1] + o2;
5188 if (mlevel2[i2] == 0xFF)
5189 mlevel2[i2] = count3++;
5190 o3 = decode[i] & 0x7F;
5191 i3 = 128*mlevel2[i2] + o3;
5192 mlevel3[i3] = i;
5193 }
5194 return result;
5195}
5196
5197static int
5198encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5199{
5200 struct encoding_map *map = (struct encoding_map*)mapping;
5201 int l1 = c>>11;
5202 int l2 = (c>>7) & 0xF;
5203 int l3 = c & 0x7F;
5204 int i;
5205
5206#ifdef Py_UNICODE_WIDE
5207 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005209 }
5210#endif
5211 if (c == 0)
5212 return 0;
5213 /* level 1*/
5214 i = map->level1[l1];
5215 if (i == 0xFF) {
5216 return -1;
5217 }
5218 /* level 2*/
5219 i = map->level23[16*i+l2];
5220 if (i == 0xFF) {
5221 return -1;
5222 }
5223 /* level 3 */
5224 i = map->level23[16*map->count2 + 128*i + l3];
5225 if (i == 0) {
5226 return -1;
5227 }
5228 return i;
5229}
5230
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231/* Lookup the character ch in the mapping. If the character
5232 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005233 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235{
Christian Heimes217cfd12007-12-02 14:31:20 +00005236 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237 PyObject *x;
5238
5239 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 x = PyObject_GetItem(mapping, w);
5242 Py_DECREF(w);
5243 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5245 /* No mapping found means: mapping is undefined. */
5246 PyErr_Clear();
5247 x = Py_None;
5248 Py_INCREF(x);
5249 return x;
5250 } else
5251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005253 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005255 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 long value = PyLong_AS_LONG(x);
5257 if (value < 0 || value > 255) {
5258 PyErr_SetString(PyExc_TypeError,
5259 "character mapping must be in range(256)");
5260 Py_DECREF(x);
5261 return NULL;
5262 }
5263 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005265 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 /* wrong return value */
5269 PyErr_Format(PyExc_TypeError,
5270 "character mapping must return integer, bytes or None, not %.400s",
5271 x->ob_type->tp_name);
5272 Py_DECREF(x);
5273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 }
5275}
5276
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005277static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005278charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005279{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005280 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5281 /* exponentially overallocate to minimize reallocations */
5282 if (requiredsize < 2*outsize)
5283 requiredsize = 2*outsize;
5284 if (_PyBytes_Resize(outobj, requiredsize))
5285 return -1;
5286 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005287}
5288
/* Outcome of encoding a single character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005292/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005293 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005294 space is available. Return a new reference to the object that
5295 was put in the output buffer, or Py_None, if the mapping was undefined
5296 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005297 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005299charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005301{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005302 PyObject *rep;
5303 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005304 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005305
Christian Heimes90aa7642007-12-19 02:45:37 +00005306 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005307 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005309 if (res == -1)
5310 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 if (outsize<requiredsize)
5312 if (charmapencode_resize(outobj, outpos, requiredsize))
5313 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005314 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005315 outstart[(*outpos)++] = (char)res;
5316 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005317 }
5318
5319 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005320 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005322 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 Py_DECREF(rep);
5324 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005325 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 if (PyLong_Check(rep)) {
5327 Py_ssize_t requiredsize = *outpos+1;
5328 if (outsize<requiredsize)
5329 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5330 Py_DECREF(rep);
5331 return enc_EXCEPTION;
5332 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005333 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005335 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 else {
5337 const char *repchars = PyBytes_AS_STRING(rep);
5338 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5339 Py_ssize_t requiredsize = *outpos+repsize;
5340 if (outsize<requiredsize)
5341 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5342 Py_DECREF(rep);
5343 return enc_EXCEPTION;
5344 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005345 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 memcpy(outstart + *outpos, repchars, repsize);
5347 *outpos += repsize;
5348 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005349 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005350 Py_DECREF(rep);
5351 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352}
5353
5354/* handle an error in PyUnicode_EncodeCharmap
5355 Return 0 on success, -1 on error */
5356static
5357int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005358 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005360 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005361 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005362{
5363 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005364 Py_ssize_t repsize;
5365 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 Py_UNICODE *uni2;
5367 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005368 Py_ssize_t collstartpos = *inpos;
5369 Py_ssize_t collendpos = *inpos+1;
5370 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 char *encoding = "charmap";
5372 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005373 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005374
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005375 /* find all unencodable characters */
5376 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005377 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005378 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 int res = encoding_map_lookup(p[collendpos], mapping);
5380 if (res != -1)
5381 break;
5382 ++collendpos;
5383 continue;
5384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005385
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 rep = charmapencode_lookup(p[collendpos], mapping);
5387 if (rep==NULL)
5388 return -1;
5389 else if (rep!=Py_None) {
5390 Py_DECREF(rep);
5391 break;
5392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005393 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005395 }
5396 /* cache callback name lookup
5397 * (if not done yet, i.e. it's the first error) */
5398 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005399 if ((errors==NULL) || (!strcmp(errors, "strict")))
5400 *known_errorHandler = 1;
5401 else if (!strcmp(errors, "replace"))
5402 *known_errorHandler = 2;
5403 else if (!strcmp(errors, "ignore"))
5404 *known_errorHandler = 3;
5405 else if (!strcmp(errors, "xmlcharrefreplace"))
5406 *known_errorHandler = 4;
5407 else
5408 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005409 }
5410 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005411 case 1: /* strict */
5412 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5413 return -1;
5414 case 2: /* replace */
5415 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 x = charmapencode_output('?', mapping, res, respos);
5417 if (x==enc_EXCEPTION) {
5418 return -1;
5419 }
5420 else if (x==enc_FAILED) {
5421 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5422 return -1;
5423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005424 }
5425 /* fall through */
5426 case 3: /* ignore */
5427 *inpos = collendpos;
5428 break;
5429 case 4: /* xmlcharrefreplace */
5430 /* generate replacement (temporarily (mis)uses p) */
5431 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005432 char buffer[2+29+1+1];
5433 char *cp;
5434 sprintf(buffer, "&#%d;", (int)p[collpos]);
5435 for (cp = buffer; *cp; ++cp) {
5436 x = charmapencode_output(*cp, mapping, res, respos);
5437 if (x==enc_EXCEPTION)
5438 return -1;
5439 else if (x==enc_FAILED) {
5440 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5441 return -1;
5442 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005443 }
5444 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005445 *inpos = collendpos;
5446 break;
5447 default:
5448 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 encoding, reason, p, size, exceptionObject,
5450 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005451 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005452 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005453 if (PyBytes_Check(repunicode)) {
5454 /* Directly copy bytes result to output. */
5455 Py_ssize_t outsize = PyBytes_Size(*res);
5456 Py_ssize_t requiredsize;
5457 repsize = PyBytes_Size(repunicode);
5458 requiredsize = *respos + repsize;
5459 if (requiredsize > outsize)
5460 /* Make room for all additional bytes. */
5461 if (charmapencode_resize(res, respos, requiredsize)) {
5462 Py_DECREF(repunicode);
5463 return -1;
5464 }
5465 memcpy(PyBytes_AsString(*res) + *respos,
5466 PyBytes_AsString(repunicode), repsize);
5467 *respos += repsize;
5468 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005469 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005470 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005471 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005472 /* generate replacement */
5473 repsize = PyUnicode_GET_SIZE(repunicode);
5474 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 x = charmapencode_output(*uni2, mapping, res, respos);
5476 if (x==enc_EXCEPTION) {
5477 return -1;
5478 }
5479 else if (x==enc_FAILED) {
5480 Py_DECREF(repunicode);
5481 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5482 return -1;
5483 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005484 }
5485 *inpos = newpos;
5486 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487 }
5488 return 0;
5489}
5490
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 Py_ssize_t size,
5493 PyObject *mapping,
5494 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005496 /* output object */
5497 PyObject *res = NULL;
5498 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005499 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005500 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005501 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 PyObject *errorHandler = NULL;
5503 PyObject *exc = NULL;
5504 /* the following variable is used for caching string comparisons
5505 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5506 * 3=ignore, 4=xmlcharrefreplace */
5507 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508
5509 /* Default to Latin-1 */
5510 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005513 /* allocate enough for a simple encoding without
5514 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005515 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 if (res == NULL)
5517 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005518 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005521 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 /* try to encode it */
5523 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5524 if (x==enc_EXCEPTION) /* error */
5525 goto onError;
5526 if (x==enc_FAILED) { /* unencodable character */
5527 if (charmap_encoding_error(p, size, &inpos, mapping,
5528 &exc,
5529 &known_errorHandler, &errorHandler, errors,
5530 &res, &respos)) {
5531 goto onError;
5532 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005533 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 else
5535 /* done with this character => adjust input position */
5536 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005539 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005540 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005541 if (_PyBytes_Resize(&res, respos) < 0)
5542 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005543
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544 Py_XDECREF(exc);
5545 Py_XDECREF(errorHandler);
5546 return res;
5547
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005549 Py_XDECREF(res);
5550 Py_XDECREF(exc);
5551 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 return NULL;
5553}
5554
5555PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557{
5558 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 PyErr_BadArgument();
5560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 }
5562 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 PyUnicode_GET_SIZE(unicode),
5564 mapping,
5565 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566}
5567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005568/* create or adjust a UnicodeTranslateError */
5569static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 const Py_UNICODE *unicode, Py_ssize_t size,
5571 Py_ssize_t startpos, Py_ssize_t endpos,
5572 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005575 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 }
5578 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5580 goto onError;
5581 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5582 goto onError;
5583 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5584 goto onError;
5585 return;
5586 onError:
5587 Py_DECREF(*exceptionObject);
5588 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 }
5590}
5591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005592/* raises a UnicodeTranslateError */
5593static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 const Py_UNICODE *unicode, Py_ssize_t size,
5595 Py_ssize_t startpos, Py_ssize_t endpos,
5596 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597{
5598 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602}
5603
5604/* error handling callback helper:
5605 build arguments, call the callback and check the arguments,
5606 put the result into newpos and return the replacement string, which
5607 has to be freed by the caller */
5608static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 PyObject **errorHandler,
5610 const char *reason,
5611 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5612 Py_ssize_t startpos, Py_ssize_t endpos,
5613 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005615 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005617 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 PyObject *restuple;
5619 PyObject *resunicode;
5620
5621 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 }
5626
5627 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631
5632 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005637 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005638 Py_DECREF(restuple);
5639 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640 }
5641 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 &resunicode, &i_newpos)) {
5643 Py_DECREF(restuple);
5644 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005646 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005648 else
5649 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005650 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5652 Py_DECREF(restuple);
5653 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005654 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 Py_INCREF(resunicode);
5656 Py_DECREF(restuple);
5657 return resunicode;
5658}
5659
5660/* Lookup the character ch in the mapping and put the result in result,
5661 which must be decrefed by the caller.
5662 Return 0 on success, -1 on error */
5663static
5664int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5665{
Christian Heimes217cfd12007-12-02 14:31:20 +00005666 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667 PyObject *x;
5668
5669 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671 x = PyObject_GetItem(mapping, w);
5672 Py_DECREF(w);
5673 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5675 /* No mapping found means: use 1:1 mapping. */
5676 PyErr_Clear();
5677 *result = NULL;
5678 return 0;
5679 } else
5680 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005681 }
5682 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 *result = x;
5684 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005686 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 long value = PyLong_AS_LONG(x);
5688 long max = PyUnicode_GetMax();
5689 if (value < 0 || value > max) {
5690 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005691 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 Py_DECREF(x);
5693 return -1;
5694 }
5695 *result = x;
5696 return 0;
5697 }
5698 else if (PyUnicode_Check(x)) {
5699 *result = x;
5700 return 0;
5701 }
5702 else {
5703 /* wrong return value */
5704 PyErr_SetString(PyExc_TypeError,
5705 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005706 Py_DECREF(x);
5707 return -1;
5708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709}
5710/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 if not reallocate and adjust various state variables.
5712 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713static
Walter Dörwald4894c302003-10-24 14:25:28 +00005714int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005717 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005718 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 /* remember old output position */
5720 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5721 /* exponentially overallocate to minimize reallocations */
5722 if (requiredsize < 2 * oldsize)
5723 requiredsize = 2 * oldsize;
5724 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5725 return -1;
5726 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727 }
5728 return 0;
5729}
5730/* lookup the character, put the result in the output string and adjust
5731 various state variables. Return a new reference to the object that
5732 was put in the output buffer in *result, or Py_None, if the mapping was
5733 undefined (in which case no character was written).
5734 The called must decref result.
5735 Return 0 on success, -1 on error. */
5736static
Walter Dörwald4894c302003-10-24 14:25:28 +00005737int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5739 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740{
Walter Dörwald4894c302003-10-24 14:25:28 +00005741 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005744 /* not found => default to 1:1 mapping */
5745 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 }
5747 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005749 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 /* no overflow check, because we know that the space is enough */
5751 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 }
5753 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5755 if (repsize==1) {
5756 /* no overflow check, because we know that the space is enough */
5757 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5758 }
5759 else if (repsize!=0) {
5760 /* more than one character */
5761 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5762 (insize - (curinp-startinp)) +
5763 repsize - 1;
5764 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5765 return -1;
5766 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5767 *outp += repsize;
5768 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 }
5770 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 return 0;
5773}
5774
5775PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 Py_ssize_t size,
5777 PyObject *mapping,
5778 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 /* output object */
5781 PyObject *res = NULL;
5782 /* pointers to the beginning and end+1 of input */
5783 const Py_UNICODE *startp = p;
5784 const Py_UNICODE *endp = p + size;
5785 /* pointer into the output */
5786 Py_UNICODE *str;
5787 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005788 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 char *reason = "character maps to <undefined>";
5790 PyObject *errorHandler = NULL;
5791 PyObject *exc = NULL;
5792 /* the following variable is used for caching string comparisons
5793 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5794 * 3=ignore, 4=xmlcharrefreplace */
5795 int known_errorHandler = -1;
5796
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 PyErr_BadArgument();
5799 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801
5802 /* allocate enough for a simple 1:1 translation without
5803 replacements, if we need more, we'll resize */
5804 res = PyUnicode_FromUnicode(NULL, size);
5805 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 /* try to encode it */
5813 PyObject *x = NULL;
5814 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5815 Py_XDECREF(x);
5816 goto onError;
5817 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005818 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 if (x!=Py_None) /* it worked => adjust input pointer */
5820 ++p;
5821 else { /* untranslatable character */
5822 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5823 Py_ssize_t repsize;
5824 Py_ssize_t newpos;
5825 Py_UNICODE *uni2;
5826 /* startpos for collecting untranslatable chars */
5827 const Py_UNICODE *collstart = p;
5828 const Py_UNICODE *collend = p+1;
5829 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 /* find all untranslatable characters */
5832 while (collend < endp) {
5833 if (charmaptranslate_lookup(*collend, mapping, &x))
5834 goto onError;
5835 Py_XDECREF(x);
5836 if (x!=Py_None)
5837 break;
5838 ++collend;
5839 }
5840 /* cache callback name lookup
5841 * (if not done yet, i.e. it's the first error) */
5842 if (known_errorHandler==-1) {
5843 if ((errors==NULL) || (!strcmp(errors, "strict")))
5844 known_errorHandler = 1;
5845 else if (!strcmp(errors, "replace"))
5846 known_errorHandler = 2;
5847 else if (!strcmp(errors, "ignore"))
5848 known_errorHandler = 3;
5849 else if (!strcmp(errors, "xmlcharrefreplace"))
5850 known_errorHandler = 4;
5851 else
5852 known_errorHandler = 0;
5853 }
5854 switch (known_errorHandler) {
5855 case 1: /* strict */
5856 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005857 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 case 2: /* replace */
5859 /* No need to check for space, this is a 1:1 replacement */
5860 for (coll = collstart; coll<collend; ++coll)
5861 *str++ = '?';
5862 /* fall through */
5863 case 3: /* ignore */
5864 p = collend;
5865 break;
5866 case 4: /* xmlcharrefreplace */
5867 /* generate replacement (temporarily (mis)uses p) */
5868 for (p = collstart; p < collend; ++p) {
5869 char buffer[2+29+1+1];
5870 char *cp;
5871 sprintf(buffer, "&#%d;", (int)*p);
5872 if (charmaptranslate_makespace(&res, &str,
5873 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5874 goto onError;
5875 for (cp = buffer; *cp; ++cp)
5876 *str++ = *cp;
5877 }
5878 p = collend;
5879 break;
5880 default:
5881 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5882 reason, startp, size, &exc,
5883 collstart-startp, collend-startp, &newpos);
5884 if (repunicode == NULL)
5885 goto onError;
5886 /* generate replacement */
5887 repsize = PyUnicode_GET_SIZE(repunicode);
5888 if (charmaptranslate_makespace(&res, &str,
5889 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5890 Py_DECREF(repunicode);
5891 goto onError;
5892 }
5893 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5894 *str++ = *uni2;
5895 p = startp + newpos;
5896 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005897 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005898 }
5899 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005900 /* Resize if we allocated to much */
5901 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005902 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 if (PyUnicode_Resize(&res, respos) < 0)
5904 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 }
5906 Py_XDECREF(exc);
5907 Py_XDECREF(errorHandler);
5908 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 Py_XDECREF(res);
5912 Py_XDECREF(exc);
5913 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 return NULL;
5915}
5916
5917PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 PyObject *mapping,
5919 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920{
5921 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005922
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 str = PyUnicode_FromObject(str);
5924 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 PyUnicode_GET_SIZE(str),
5928 mapping,
5929 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 Py_DECREF(str);
5931 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005932
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 Py_XDECREF(str);
5935 return NULL;
5936}
Tim Petersced69f82003-09-16 20:30:58 +00005937
Guido van Rossum9e896b32000-04-05 20:11:21 +00005938/* --- Decimal Encoder ---------------------------------------------------- */
5939
5940int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 Py_ssize_t length,
5942 char *output,
5943 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005944{
5945 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946 PyObject *errorHandler = NULL;
5947 PyObject *exc = NULL;
5948 const char *encoding = "decimal";
5949 const char *reason = "invalid decimal Unicode string";
5950 /* the following variable is used for caching string comparisons
5951 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5952 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005953
5954 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 PyErr_BadArgument();
5956 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005957 }
5958
5959 p = s;
5960 end = s + length;
5961 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 register Py_UNICODE ch = *p;
5963 int decimal;
5964 PyObject *repunicode;
5965 Py_ssize_t repsize;
5966 Py_ssize_t newpos;
5967 Py_UNICODE *uni2;
5968 Py_UNICODE *collstart;
5969 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005970
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005972 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 ++p;
5974 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005975 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 decimal = Py_UNICODE_TODECIMAL(ch);
5977 if (decimal >= 0) {
5978 *output++ = '0' + decimal;
5979 ++p;
5980 continue;
5981 }
5982 if (0 < ch && ch < 256) {
5983 *output++ = (char)ch;
5984 ++p;
5985 continue;
5986 }
5987 /* All other characters are considered unencodable */
5988 collstart = p;
5989 collend = p+1;
5990 while (collend < end) {
5991 if ((0 < *collend && *collend < 256) ||
5992 !Py_UNICODE_ISSPACE(*collend) ||
5993 Py_UNICODE_TODECIMAL(*collend))
5994 break;
5995 }
5996 /* cache callback name lookup
5997 * (if not done yet, i.e. it's the first error) */
5998 if (known_errorHandler==-1) {
5999 if ((errors==NULL) || (!strcmp(errors, "strict")))
6000 known_errorHandler = 1;
6001 else if (!strcmp(errors, "replace"))
6002 known_errorHandler = 2;
6003 else if (!strcmp(errors, "ignore"))
6004 known_errorHandler = 3;
6005 else if (!strcmp(errors, "xmlcharrefreplace"))
6006 known_errorHandler = 4;
6007 else
6008 known_errorHandler = 0;
6009 }
6010 switch (known_errorHandler) {
6011 case 1: /* strict */
6012 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6013 goto onError;
6014 case 2: /* replace */
6015 for (p = collstart; p < collend; ++p)
6016 *output++ = '?';
6017 /* fall through */
6018 case 3: /* ignore */
6019 p = collend;
6020 break;
6021 case 4: /* xmlcharrefreplace */
6022 /* generate replacement (temporarily (mis)uses p) */
6023 for (p = collstart; p < collend; ++p)
6024 output += sprintf(output, "&#%d;", (int)*p);
6025 p = collend;
6026 break;
6027 default:
6028 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6029 encoding, reason, s, length, &exc,
6030 collstart-s, collend-s, &newpos);
6031 if (repunicode == NULL)
6032 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006033 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006034 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006035 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6036 Py_DECREF(repunicode);
6037 goto onError;
6038 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 /* generate replacement */
6040 repsize = PyUnicode_GET_SIZE(repunicode);
6041 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6042 Py_UNICODE ch = *uni2;
6043 if (Py_UNICODE_ISSPACE(ch))
6044 *output++ = ' ';
6045 else {
6046 decimal = Py_UNICODE_TODECIMAL(ch);
6047 if (decimal >= 0)
6048 *output++ = '0' + decimal;
6049 else if (0 < ch && ch < 256)
6050 *output++ = (char)ch;
6051 else {
6052 Py_DECREF(repunicode);
6053 raise_encode_exception(&exc, encoding,
6054 s, length, collstart-s, collend-s, reason);
6055 goto onError;
6056 }
6057 }
6058 }
6059 p = s + newpos;
6060 Py_DECREF(repunicode);
6061 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006062 }
6063 /* 0-terminate the output string */
6064 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 Py_XDECREF(exc);
6066 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006067 return 0;
6068
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 Py_XDECREF(exc);
6071 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006072 return -1;
6073}
6074
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075/* --- Helpers ------------------------------------------------------------ */
6076
Eric Smith8c663262007-08-25 02:26:07 +00006077#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006078#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006079
Thomas Wouters477c8d52006-05-27 19:21:47 +00006080#include "stringlib/count.h"
6081#include "stringlib/find.h"
6082#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006083#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006084
Eric Smith5807c412008-05-11 21:00:57 +00006085#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006086#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006087#include "stringlib/localeutil.h"
6088
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len when it exceeds it; a negative end is offset by
   len and then clamped to 0.  A negative start is offset by len and then
   clamped to 0 (a start larger than len is left untouched).

   start and end must be modifiable lvalues; every argument is evaluated
   more than once.  The body is wrapped in do { } while (0) so that the
   macro acts as a single statement, e.g. in an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006103
Martin v. Löwis18e16552006-02-15 17:27:45 +00006104Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006105 PyObject *substr,
6106 Py_ssize_t start,
6107 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006109 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006110 PyUnicodeObject* str_obj;
6111 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006112
Thomas Wouters477c8d52006-05-27 19:21:47 +00006113 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6114 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006116 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6117 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 Py_DECREF(str_obj);
6119 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 }
Tim Petersced69f82003-09-16 20:30:58 +00006121
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006122 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006123 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006124 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6125 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006126 );
6127
6128 Py_DECREF(sub_obj);
6129 Py_DECREF(str_obj);
6130
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 return result;
6132}
6133
Martin v. Löwis18e16552006-02-15 17:27:45 +00006134Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006135 PyObject *sub,
6136 Py_ssize_t start,
6137 Py_ssize_t end,
6138 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006140 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006141
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006143 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006145 sub = PyUnicode_FromObject(sub);
6146 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 Py_DECREF(str);
6148 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Tim Petersced69f82003-09-16 20:30:58 +00006150
Thomas Wouters477c8d52006-05-27 19:21:47 +00006151 if (direction > 0)
6152 result = stringlib_find_slice(
6153 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6154 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6155 start, end
6156 );
6157 else
6158 result = stringlib_rfind_slice(
6159 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6160 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6161 start, end
6162 );
6163
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006165 Py_DECREF(sub);
6166
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 return result;
6168}
6169
Tim Petersced69f82003-09-16 20:30:58 +00006170static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 PyUnicodeObject *substring,
6173 Py_ssize_t start,
6174 Py_ssize_t end,
6175 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 if (substring->length == 0)
6178 return 1;
6179
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006180 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 end -= substring->length;
6182 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006183 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
6185 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 if (Py_UNICODE_MATCH(self, end, substring))
6187 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 } else {
6189 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 }
6192
6193 return 0;
6194}
6195
Martin v. Löwis18e16552006-02-15 17:27:45 +00006196Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 PyObject *substr,
6198 Py_ssize_t start,
6199 Py_ssize_t end,
6200 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006202 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006203
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 str = PyUnicode_FromObject(str);
6205 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 substr = PyUnicode_FromObject(substr);
6208 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 Py_DECREF(str);
6210 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 }
Tim Petersced69f82003-09-16 20:30:58 +00006212
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 (PyUnicodeObject *)substr,
6215 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 Py_DECREF(str);
6217 Py_DECREF(substr);
6218 return result;
6219}
6220
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221/* Apply fixfct filter to the Unicode object self and return a
6222 reference to the modified object */
6223
Tim Petersced69f82003-09-16 20:30:58 +00006224static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227{
6228
6229 PyUnicodeObject *u;
6230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006231 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006234
6235 Py_UNICODE_COPY(u->str, self->str, self->length);
6236
Tim Peters7a29bd52001-09-12 03:03:31 +00006237 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 /* fixfct should return TRUE if it modified the buffer. If
6239 FALSE, return a reference to the original buffer instead
6240 (to save space, not time) */
6241 Py_INCREF(self);
6242 Py_DECREF(u);
6243 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 }
6245 return (PyObject*) u;
6246}
6247
Tim Petersced69f82003-09-16 20:30:58 +00006248static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249int fixupper(PyUnicodeObject *self)
6250{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006251 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 Py_UNICODE *s = self->str;
6253 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006254
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006257
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 ch = Py_UNICODE_TOUPPER(*s);
6259 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 *s = ch;
6262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 s++;
6264 }
6265
6266 return status;
6267}
6268
Tim Petersced69f82003-09-16 20:30:58 +00006269static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270int fixlower(PyUnicodeObject *self)
6271{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006272 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 Py_UNICODE *s = self->str;
6274 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006275
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006278
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 ch = Py_UNICODE_TOLOWER(*s);
6280 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 *s = ch;
6283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 s++;
6285 }
6286
6287 return status;
6288}
6289
Tim Petersced69f82003-09-16 20:30:58 +00006290static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291int fixswapcase(PyUnicodeObject *self)
6292{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006293 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 Py_UNICODE *s = self->str;
6295 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006296
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 while (len-- > 0) {
6298 if (Py_UNICODE_ISUPPER(*s)) {
6299 *s = Py_UNICODE_TOLOWER(*s);
6300 status = 1;
6301 } else if (Py_UNICODE_ISLOWER(*s)) {
6302 *s = Py_UNICODE_TOUPPER(*s);
6303 status = 1;
6304 }
6305 s++;
6306 }
6307
6308 return status;
6309}
6310
Tim Petersced69f82003-09-16 20:30:58 +00006311static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312int fixcapitalize(PyUnicodeObject *self)
6313{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006314 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006315 Py_UNICODE *s = self->str;
6316 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006317
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006318 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006320 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 *s = Py_UNICODE_TOUPPER(*s);
6322 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006324 s++;
6325 while (--len > 0) {
6326 if (Py_UNICODE_ISUPPER(*s)) {
6327 *s = Py_UNICODE_TOLOWER(*s);
6328 status = 1;
6329 }
6330 s++;
6331 }
6332 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333}
6334
6335static
6336int fixtitle(PyUnicodeObject *self)
6337{
6338 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6339 register Py_UNICODE *e;
6340 int previous_is_cased;
6341
6342 /* Shortcut for single character strings */
6343 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6345 if (*p != ch) {
6346 *p = ch;
6347 return 1;
6348 }
6349 else
6350 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 }
Tim Petersced69f82003-09-16 20:30:58 +00006352
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 e = p + PyUnicode_GET_SIZE(self);
6354 previous_is_cased = 0;
6355 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006357
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 if (previous_is_cased)
6359 *p = Py_UNICODE_TOLOWER(ch);
6360 else
6361 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006362
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 if (Py_UNICODE_ISLOWER(ch) ||
6364 Py_UNICODE_ISUPPER(ch) ||
6365 Py_UNICODE_ISTITLE(ch))
6366 previous_is_cased = 1;
6367 else
6368 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 }
6370 return 1;
6371}
6372
Tim Peters8ce9f162004-08-27 01:49:32 +00006373PyObject *
6374PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
Skip Montanaro6543b452004-09-16 03:28:13 +00006376 const Py_UNICODE blank = ' ';
6377 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006378 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006379 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006380 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6381 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006382 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6383 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006384 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006385 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386
Tim Peters05eba1f2004-08-27 21:32:02 +00006387 fseq = PySequence_Fast(seq, "");
6388 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006389 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006390 }
6391
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006392 /* NOTE: the following code can't call back into Python code,
6393 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006394 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006395
Tim Peters05eba1f2004-08-27 21:32:02 +00006396 seqlen = PySequence_Fast_GET_SIZE(fseq);
6397 /* If empty sequence, return u"". */
6398 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006399 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6400 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006401 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006402 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006403 /* If singleton sequence with an exact Unicode, return that. */
6404 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 item = items[0];
6406 if (PyUnicode_CheckExact(item)) {
6407 Py_INCREF(item);
6408 res = (PyUnicodeObject *)item;
6409 goto Done;
6410 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006411 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006412 else {
6413 /* Set up sep and seplen */
6414 if (separator == NULL) {
6415 sep = &blank;
6416 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006417 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006418 else {
6419 if (!PyUnicode_Check(separator)) {
6420 PyErr_Format(PyExc_TypeError,
6421 "separator: expected str instance,"
6422 " %.80s found",
6423 Py_TYPE(separator)->tp_name);
6424 goto onError;
6425 }
6426 sep = PyUnicode_AS_UNICODE(separator);
6427 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006428 }
6429 }
6430
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006431 /* There are at least two things to join, or else we have a subclass
6432 * of str in the sequence.
6433 * Do a pre-pass to figure out the total amount of space we'll
6434 * need (sz), and see whether all argument are strings.
6435 */
6436 sz = 0;
6437 for (i = 0; i < seqlen; i++) {
6438 const Py_ssize_t old_sz = sz;
6439 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006440 if (!PyUnicode_Check(item)) {
6441 PyErr_Format(PyExc_TypeError,
6442 "sequence item %zd: expected str instance,"
6443 " %.80s found",
6444 i, Py_TYPE(item)->tp_name);
6445 goto onError;
6446 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006447 sz += PyUnicode_GET_SIZE(item);
6448 if (i != 0)
6449 sz += seplen;
6450 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6451 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006453 goto onError;
6454 }
6455 }
Tim Petersced69f82003-09-16 20:30:58 +00006456
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006457 res = _PyUnicode_New(sz);
6458 if (res == NULL)
6459 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006460
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006461 /* Catenate everything. */
6462 res_p = PyUnicode_AS_UNICODE(res);
6463 for (i = 0; i < seqlen; ++i) {
6464 Py_ssize_t itemlen;
6465 item = items[i];
6466 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 /* Copy item, and maybe the separator. */
6468 if (i) {
6469 Py_UNICODE_COPY(res_p, sep, seplen);
6470 res_p += seplen;
6471 }
6472 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6473 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006474 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006475
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006477 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 return (PyObject *)res;
6479
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006481 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006482 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 return NULL;
6484}
6485
Tim Petersced69f82003-09-16 20:30:58 +00006486static
6487PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 Py_ssize_t left,
6489 Py_ssize_t right,
6490 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491{
6492 PyUnicodeObject *u;
6493
6494 if (left < 0)
6495 left = 0;
6496 if (right < 0)
6497 right = 0;
6498
Tim Peters7a29bd52001-09-12 03:03:31 +00006499 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 Py_INCREF(self);
6501 return self;
6502 }
6503
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006504 if (left > PY_SSIZE_T_MAX - self->length ||
6505 right > PY_SSIZE_T_MAX - (left + self->length)) {
6506 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6507 return NULL;
6508 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 u = _PyUnicode_New(left + self->length + right);
6510 if (u) {
6511 if (left)
6512 Py_UNICODE_FILL(u->str, fill, left);
6513 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6514 if (right)
6515 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6516 }
6517
6518 return u;
6519}
6520
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006521PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524
6525 string = PyUnicode_FromObject(string);
6526 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006529 list = stringlib_splitlines(
6530 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6531 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532
6533 Py_DECREF(string);
6534 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535}
6536
Tim Petersced69f82003-09-16 20:30:58 +00006537static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 PyUnicodeObject *substring,
6540 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006543 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006546 return stringlib_split_whitespace(
6547 (PyObject*) self, self->str, self->length, maxcount
6548 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006550 return stringlib_split(
6551 (PyObject*) self, self->str, self->length,
6552 substring->str, substring->length,
6553 maxcount
6554 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555}
6556
Tim Petersced69f82003-09-16 20:30:58 +00006557static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006558PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 PyUnicodeObject *substring,
6560 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006561{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006562 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006563 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006564
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006565 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006566 return stringlib_rsplit_whitespace(
6567 (PyObject*) self, self->str, self->length, maxcount
6568 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006569
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006570 return stringlib_rsplit(
6571 (PyObject*) self, self->str, self->length,
6572 substring->str, substring->length,
6573 maxcount
6574 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006575}
6576
6577static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 PyUnicodeObject *str1,
6580 PyUnicodeObject *str2,
6581 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582{
6583 PyUnicodeObject *u;
6584
6585 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006587 else if (maxcount == 0 || self->length == 0)
6588 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
Thomas Wouters477c8d52006-05-27 19:21:47 +00006590 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006591 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006592 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006593 if (str1->length == 0)
6594 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006595 if (str1->length == 1) {
6596 /* replace characters */
6597 Py_UNICODE u1, u2;
6598 if (!findchar(self->str, self->length, str1->str[0]))
6599 goto nothing;
6600 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6601 if (!u)
6602 return NULL;
6603 Py_UNICODE_COPY(u->str, self->str, self->length);
6604 u1 = str1->str[0];
6605 u2 = str2->str[0];
6606 for (i = 0; i < u->length; i++)
6607 if (u->str[i] == u1) {
6608 if (--maxcount < 0)
6609 break;
6610 u->str[i] = u2;
6611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006613 i = stringlib_find(
6614 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006616 if (i < 0)
6617 goto nothing;
6618 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6619 if (!u)
6620 return NULL;
6621 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006622
6623 /* change everything in-place, starting with this one */
6624 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6625 i += str1->length;
6626
6627 while ( --maxcount > 0) {
6628 i = stringlib_find(self->str+i, self->length-i,
6629 str1->str, str1->length,
6630 i);
6631 if (i == -1)
6632 break;
6633 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6634 i += str1->length;
6635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006638
6639 Py_ssize_t n, i, j, e;
6640 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 Py_UNICODE *p;
6642
6643 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006644 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6645 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006646 if (n == 0)
6647 goto nothing;
6648 /* new_size = self->length + n * (str2->length - str1->length)); */
6649 delta = (str2->length - str1->length);
6650 if (delta == 0) {
6651 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006653 product = n * (str2->length - str1->length);
6654 if ((product / (str2->length - str1->length)) != n) {
6655 PyErr_SetString(PyExc_OverflowError,
6656 "replace string is too long");
6657 return NULL;
6658 }
6659 new_size = self->length + product;
6660 if (new_size < 0) {
6661 PyErr_SetString(PyExc_OverflowError,
6662 "replace string is too long");
6663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
6665 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006666 u = _PyUnicode_New(new_size);
6667 if (!u)
6668 return NULL;
6669 i = 0;
6670 p = u->str;
6671 e = self->length - str1->length;
6672 if (str1->length > 0) {
6673 while (n-- > 0) {
6674 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006675 j = stringlib_find(self->str+i, self->length-i,
6676 str1->str, str1->length,
6677 i);
6678 if (j == -1)
6679 break;
6680 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006681 /* copy unchanged part [i:j] */
6682 Py_UNICODE_COPY(p, self->str+i, j-i);
6683 p += j - i;
6684 }
6685 /* copy substitution string */
6686 if (str2->length > 0) {
6687 Py_UNICODE_COPY(p, str2->str, str2->length);
6688 p += str2->length;
6689 }
6690 i = j + str1->length;
6691 }
6692 if (i < self->length)
6693 /* copy tail [i:] */
6694 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6695 } else {
6696 /* interleave */
6697 while (n > 0) {
6698 Py_UNICODE_COPY(p, str2->str, str2->length);
6699 p += str2->length;
6700 if (--n <= 0)
6701 break;
6702 *p++ = self->str[i++];
6703 }
6704 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006708
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006710 /* nothing to replace; return original string (when possible) */
6711 if (PyUnicode_CheckExact(self)) {
6712 Py_INCREF(self);
6713 return (PyObject *) self;
6714 }
6715 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716}
6717
6718/* --- Unicode Object Methods --------------------------------------------- */
6719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006720PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722\n\
6723Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006724characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725
6726static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006727unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 return fixup(self, fixtitle);
6730}
6731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006732PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734\n\
6735Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006736have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737
6738static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006739unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 return fixup(self, fixcapitalize);
6742}
6743
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list, run
   every list item through fixup()/fixcapitalize, then join the list again.
   Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* result is still NULL here */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6781
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006782/* Argument converter. Coerces to a single unicode character */
6783
6784static int
6785convert_uc(PyObject *obj, void *addr)
6786{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006787 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6788 PyObject *uniobj;
6789 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006790
Benjamin Peterson14339b62009-01-31 16:36:08 +00006791 uniobj = PyUnicode_FromObject(obj);
6792 if (uniobj == NULL) {
6793 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006795 return 0;
6796 }
6797 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6798 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006800 Py_DECREF(uniobj);
6801 return 0;
6802 }
6803 unistr = PyUnicode_AS_UNICODE(uniobj);
6804 *fillcharloc = unistr[0];
6805 Py_DECREF(uniobj);
6806 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006807}
6808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006809PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006812Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006813done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
6815static PyObject *
6816unicode_center(PyUnicodeObject *self, PyObject *args)
6817{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006818 Py_ssize_t marg, left;
6819 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006820 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821
Thomas Woutersde017742006-02-16 19:34:37 +00006822 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 return NULL;
6824
Tim Peters7a29bd52001-09-12 03:03:31 +00006825 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 Py_INCREF(self);
6827 return (PyObject*) self;
6828 }
6829
6830 marg = width - self->length;
6831 left = marg / 2 + (marg & width & 1);
6832
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006833 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834}
6835
Marc-André Lemburge5034372000-08-08 08:04:29 +00006836#if 0
6837
6838/* This code should go into some future Unicode collation support
6839 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006840 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006841
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006842/* speedy UTF-16 code point order comparison */
6843/* gleaned from: */
6844/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6845
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006846static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006847{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006848 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006849 0, 0, 0, 0, 0, 0, 0, 0,
6850 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006851 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006852};
6853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854static int
6855unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6856{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006857 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006858
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 Py_UNICODE *s1 = str1->str;
6860 Py_UNICODE *s2 = str2->str;
6861
6862 len1 = str1->length;
6863 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006864
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006866 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006867
6868 c1 = *s1++;
6869 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006870
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 if (c1 > (1<<11) * 26)
6872 c1 += utf16Fixup[c1>>11];
6873 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006874 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006875 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006876
6877 if (c1 != c2)
6878 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006879
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006880 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 }
6882
6883 return (len1 < len2) ? -1 : (len1 != len2);
6884}
6885
Marc-André Lemburge5034372000-08-08 08:04:29 +00006886#else
6887
6888static int
6889unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6890{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006891 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006892
6893 Py_UNICODE *s1 = str1->str;
6894 Py_UNICODE *s2 = str2->str;
6895
6896 len1 = str1->length;
6897 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006898
Marc-André Lemburge5034372000-08-08 08:04:29 +00006899 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006900 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006901
Fredrik Lundh45714e92001-06-26 16:39:36 +00006902 c1 = *s1++;
6903 c2 = *s2++;
6904
6905 if (c1 != c2)
6906 return (c1 < c2) ? -1 : 1;
6907
Marc-André Lemburge5034372000-08-08 08:04:29 +00006908 len1--; len2--;
6909 }
6910
6911 return (len1 < len2) ? -1 : (len1 != len2);
6912}
6913
6914#endif
6915
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006919 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6920 return unicode_compare((PyUnicodeObject *)left,
6921 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006922 PyErr_Format(PyExc_TypeError,
6923 "Can't compare %.100s and %.100s",
6924 left->ob_type->tp_name,
6925 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 return -1;
6927}
6928
Martin v. Löwis5b222132007-06-10 09:51:05 +00006929int
6930PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6931{
6932 int i;
6933 Py_UNICODE *id;
6934 assert(PyUnicode_Check(uni));
6935 id = PyUnicode_AS_UNICODE(uni);
6936 /* Compare Unicode string and source character set string */
6937 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 if (id[i] != str[i])
6939 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006940 /* This check keeps Python strings that end in '\0' from comparing equal
6941 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006942 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006944 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006946 return 0;
6947}
6948
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006949
Benjamin Peterson29060642009-01-31 22:14:21 +00006950#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006951 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006952
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006953PyObject *PyUnicode_RichCompare(PyObject *left,
6954 PyObject *right,
6955 int op)
6956{
6957 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006958
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006959 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6960 PyObject *v;
6961 if (((PyUnicodeObject *) left)->length !=
6962 ((PyUnicodeObject *) right)->length) {
6963 if (op == Py_EQ) {
6964 Py_INCREF(Py_False);
6965 return Py_False;
6966 }
6967 if (op == Py_NE) {
6968 Py_INCREF(Py_True);
6969 return Py_True;
6970 }
6971 }
6972 if (left == right)
6973 result = 0;
6974 else
6975 result = unicode_compare((PyUnicodeObject *)left,
6976 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006977
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006978 /* Convert the return value to a Boolean */
6979 switch (op) {
6980 case Py_EQ:
6981 v = TEST_COND(result == 0);
6982 break;
6983 case Py_NE:
6984 v = TEST_COND(result != 0);
6985 break;
6986 case Py_LE:
6987 v = TEST_COND(result <= 0);
6988 break;
6989 case Py_GE:
6990 v = TEST_COND(result >= 0);
6991 break;
6992 case Py_LT:
6993 v = TEST_COND(result == -1);
6994 break;
6995 case Py_GT:
6996 v = TEST_COND(result == 1);
6997 break;
6998 default:
6999 PyErr_BadArgument();
7000 return NULL;
7001 }
7002 Py_INCREF(v);
7003 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007004 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007005
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007006 Py_INCREF(Py_NotImplemented);
7007 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007008}
7009
Guido van Rossum403d68b2000-03-13 15:55:09 +00007010int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007012{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007013 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007014 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007015
7016 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007017 sub = PyUnicode_FromObject(element);
7018 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 PyErr_Format(PyExc_TypeError,
7020 "'in <string>' requires string as left operand, not %s",
7021 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007022 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007023 }
7024
Thomas Wouters477c8d52006-05-27 19:21:47 +00007025 str = PyUnicode_FromObject(container);
7026 if (!str) {
7027 Py_DECREF(sub);
7028 return -1;
7029 }
7030
7031 result = stringlib_contains_obj(str, sub);
7032
7033 Py_DECREF(str);
7034 Py_DECREF(sub);
7035
Guido van Rossum403d68b2000-03-13 15:55:09 +00007036 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007037}
7038
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039/* Concat to string or Unicode object giving a new Unicode object. */
7040
7041PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043{
7044 PyUnicodeObject *u = NULL, *v = NULL, *w;
7045
7046 /* Coerce the two arguments */
7047 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7048 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7051 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053
7054 /* Shortcuts */
7055 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 Py_DECREF(v);
7057 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 }
7059 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 Py_DECREF(u);
7061 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 }
7063
7064 /* Concat the two Unicode strings */
7065 w = _PyUnicode_New(u->length + v->length);
7066 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 Py_UNICODE_COPY(w->str, u->str, u->length);
7069 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7070
7071 Py_DECREF(u);
7072 Py_DECREF(v);
7073 return (PyObject *)w;
7074
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076 Py_XDECREF(u);
7077 Py_XDECREF(v);
7078 return NULL;
7079}
7080
Walter Dörwald1ab83302007-05-18 17:15:44 +00007081void
7082PyUnicode_Append(PyObject **pleft, PyObject *right)
7083{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007084 PyObject *new;
7085 if (*pleft == NULL)
7086 return;
7087 if (right == NULL || !PyUnicode_Check(*pleft)) {
7088 Py_DECREF(*pleft);
7089 *pleft = NULL;
7090 return;
7091 }
7092 new = PyUnicode_Concat(*pleft, right);
7093 Py_DECREF(*pleft);
7094 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007095}
7096
7097void
7098PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7099{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007100 PyUnicode_Append(pleft, right);
7101 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007102}
7103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007104PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007107Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007108string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007109interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110
7111static PyObject *
7112unicode_count(PyUnicodeObject *self, PyObject *args)
7113{
7114 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007115 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007116 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 PyObject *result;
7118
Guido van Rossumb8872e62000-05-09 14:14:27 +00007119 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121 return NULL;
7122
7123 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007124 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007127
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007128 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007129 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007130 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007131 substring->str, substring->length,
7132 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007133 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134
7135 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007136
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 return result;
7138}
7139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007140PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007143Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007144to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007145handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007146a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7147'xmlcharrefreplace' as well as any other name registered with\n\
7148codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
7150static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007151unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007153 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 char *encoding = NULL;
7155 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007156 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007157
Benjamin Peterson308d6372009-09-18 21:42:35 +00007158 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7159 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007161 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007162 if (v == NULL)
7163 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007164 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007165 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007166 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007167 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007168 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007169 Py_DECREF(v);
7170 return NULL;
7171 }
7172 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007173
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007175 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007176}
7177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007178PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180\n\
7181Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007182If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183
7184static PyObject*
7185unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7186{
7187 Py_UNICODE *e;
7188 Py_UNICODE *p;
7189 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007190 Py_UNICODE *qe;
7191 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 PyUnicodeObject *u;
7193 int tabsize = 8;
7194
7195 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197
Thomas Wouters7e474022000-07-16 12:04:32 +00007198 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007199 i = 0; /* chars up to and including most recent \n or \r */
7200 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7201 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 for (p = self->str; p < e; p++)
7203 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 if (tabsize > 0) {
7205 incr = tabsize - (j % tabsize); /* cannot overflow */
7206 if (j > PY_SSIZE_T_MAX - incr)
7207 goto overflow1;
7208 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007209 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 if (j > PY_SSIZE_T_MAX - 1)
7213 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 j++;
7215 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 if (i > PY_SSIZE_T_MAX - j)
7217 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007219 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 }
7221 }
7222
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007223 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007225
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226 /* Second pass: create output string and fill it */
7227 u = _PyUnicode_New(i + j);
7228 if (!u)
7229 return NULL;
7230
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007231 j = 0; /* same as in first pass */
7232 q = u->str; /* next output char */
7233 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234
7235 for (p = self->str; p < e; p++)
7236 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 if (tabsize > 0) {
7238 i = tabsize - (j % tabsize);
7239 j += i;
7240 while (i--) {
7241 if (q >= qe)
7242 goto overflow2;
7243 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007244 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007246 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 else {
7248 if (q >= qe)
7249 goto overflow2;
7250 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007251 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 if (*p == '\n' || *p == '\r')
7253 j = 0;
7254 }
7255
7256 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007257
7258 overflow2:
7259 Py_DECREF(u);
7260 overflow1:
7261 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263}
7264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007265PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267\n\
7268Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007269such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270arguments start and end are interpreted as in slice notation.\n\
7271\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007272Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273
7274static PyObject *
7275unicode_find(PyUnicodeObject *self, PyObject *args)
7276{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007277 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007278 Py_ssize_t start;
7279 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007280 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281
Christian Heimes9cd17752007-11-18 19:35:23 +00007282 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284
Thomas Wouters477c8d52006-05-27 19:21:47 +00007285 result = stringlib_find_slice(
7286 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7287 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7288 start, end
7289 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290
7291 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007292
Christian Heimes217cfd12007-12-02 14:31:20 +00007293 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294}
7295
7296static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007297unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298{
7299 if (index < 0 || index >= self->length) {
7300 PyErr_SetString(PyExc_IndexError, "string index out of range");
7301 return NULL;
7302 }
7303
7304 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7305}
7306
Guido van Rossumc2504932007-09-18 19:42:40 +00007307/* Believe it or not, this produces the same value for ASCII strings
7308 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007310unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311{
Guido van Rossumc2504932007-09-18 19:42:40 +00007312 Py_ssize_t len;
7313 Py_UNICODE *p;
7314 long x;
7315
7316 if (self->hash != -1)
7317 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007318 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007319 p = self->str;
7320 x = *p << 7;
7321 while (--len >= 0)
7322 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007323 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007324 if (x == -1)
7325 x = -2;
7326 self->hash = x;
7327 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328}
7329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007330PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007333Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334
7335static PyObject *
7336unicode_index(PyUnicodeObject *self, PyObject *args)
7337{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007338 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007339 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007340 Py_ssize_t start;
7341 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342
Christian Heimes9cd17752007-11-18 19:35:23 +00007343 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345
Thomas Wouters477c8d52006-05-27 19:21:47 +00007346 result = stringlib_find_slice(
7347 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7348 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7349 start, end
7350 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351
7352 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007353
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 if (result < 0) {
7355 PyErr_SetString(PyExc_ValueError, "substring not found");
7356 return NULL;
7357 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007358
Christian Heimes217cfd12007-12-02 14:31:20 +00007359 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360}
7361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007362PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007365Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007366at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007369unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370{
7371 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7372 register const Py_UNICODE *e;
7373 int cased;
7374
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 /* Shortcut for single character strings */
7376 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007379 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007380 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007382
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383 e = p + PyUnicode_GET_SIZE(self);
7384 cased = 0;
7385 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007387
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7389 return PyBool_FromLong(0);
7390 else if (!cased && Py_UNICODE_ISLOWER(ch))
7391 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007393 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394}
7395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007396PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007399Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007400at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401
7402static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007403unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404{
7405 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7406 register const Py_UNICODE *e;
7407 int cased;
7408
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409 /* Shortcut for single character strings */
7410 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007413 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007414 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007416
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 e = p + PyUnicode_GET_SIZE(self);
7418 cased = 0;
7419 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007421
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7423 return PyBool_FromLong(0);
7424 else if (!cased && Py_UNICODE_ISUPPER(ch))
7425 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007427 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428}
7429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007430PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007433Return True if S is a titlecased string and there is at least one\n\
7434character in S, i.e. upper- and titlecase characters may only\n\
7435follow uncased characters and lowercase characters only cased ones.\n\
7436Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437
7438static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007439unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440{
7441 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7442 register const Py_UNICODE *e;
7443 int cased, previous_is_cased;
7444
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 /* Shortcut for single character strings */
7446 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7448 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007450 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007451 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007453
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 e = p + PyUnicode_GET_SIZE(self);
7455 cased = 0;
7456 previous_is_cased = 0;
7457 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007459
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7461 if (previous_is_cased)
7462 return PyBool_FromLong(0);
7463 previous_is_cased = 1;
7464 cased = 1;
7465 }
7466 else if (Py_UNICODE_ISLOWER(ch)) {
7467 if (!previous_is_cased)
7468 return PyBool_FromLong(0);
7469 previous_is_cased = 1;
7470 cased = 1;
7471 }
7472 else
7473 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007475 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476}
7477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007478PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007481Return True if all characters in S are whitespace\n\
7482and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483
7484static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007485unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486{
7487 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7488 register const Py_UNICODE *e;
7489
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 /* Shortcut for single character strings */
7491 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 Py_UNICODE_ISSPACE(*p))
7493 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007495 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007496 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007498
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 e = p + PyUnicode_GET_SIZE(self);
7500 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 if (!Py_UNICODE_ISSPACE(*p))
7502 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007504 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505}
7506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007507PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007509\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007510Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007511and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007512
7513static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007514unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007515{
7516 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7517 register const Py_UNICODE *e;
7518
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007519 /* Shortcut for single character strings */
7520 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 Py_UNICODE_ISALPHA(*p))
7522 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007523
7524 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007525 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007527
7528 e = p + PyUnicode_GET_SIZE(self);
7529 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 if (!Py_UNICODE_ISALPHA(*p))
7531 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007532 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007533 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007534}
7535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007536PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007538\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007539Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007540and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007541
7542static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007543unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007544{
7545 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7546 register const Py_UNICODE *e;
7547
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007548 /* Shortcut for single character strings */
7549 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 Py_UNICODE_ISALNUM(*p))
7551 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007552
7553 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007554 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007556
7557 e = p + PyUnicode_GET_SIZE(self);
7558 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 if (!Py_UNICODE_ISALNUM(*p))
7560 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007561 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007562 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007563}
7564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007565PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007568Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007569False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570
7571static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007572unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573{
7574 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7575 register const Py_UNICODE *e;
7576
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 /* Shortcut for single character strings */
7578 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 Py_UNICODE_ISDECIMAL(*p))
7580 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007582 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007583 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007585
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 e = p + PyUnicode_GET_SIZE(self);
7587 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 if (!Py_UNICODE_ISDECIMAL(*p))
7589 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007591 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592}
7593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007594PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007597Return True if all characters in S are digits\n\
7598and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599
7600static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007601unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602{
7603 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7604 register const Py_UNICODE *e;
7605
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 /* Shortcut for single character strings */
7607 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 Py_UNICODE_ISDIGIT(*p))
7609 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007611 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007612 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007614
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 e = p + PyUnicode_GET_SIZE(self);
7616 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 if (!Py_UNICODE_ISDIGIT(*p))
7618 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007620 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621}
7622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007623PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007626Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007627False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628
7629static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007630unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631{
7632 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7633 register const Py_UNICODE *e;
7634
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635 /* Shortcut for single character strings */
7636 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 Py_UNICODE_ISNUMERIC(*p))
7638 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007640 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007641 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007643
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 e = p + PyUnicode_GET_SIZE(self);
7645 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 if (!Py_UNICODE_ISNUMERIC(*p))
7647 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007649 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650}
7651
Martin v. Löwis47383402007-08-15 07:32:56 +00007652int
7653PyUnicode_IsIdentifier(PyObject *self)
7654{
7655 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7656 register const Py_UNICODE *e;
7657
7658 /* Special case for empty strings */
7659 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007661
7662 /* PEP 3131 says that the first character must be in
7663 XID_Start and subsequent characters in XID_Continue,
7664 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007665 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007666 letters, digits, underscore). However, given the current
7667 definition of XID_Start and XID_Continue, it is sufficient
7668 to check just for these, except that _ must be allowed
7669 as starting an identifier. */
7670 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7671 return 0;
7672
7673 e = p + PyUnicode_GET_SIZE(self);
7674 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 if (!_PyUnicode_IsXidContinue(*p))
7676 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007677 }
7678 return 1;
7679}
7680
7681PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007683\n\
7684Return True if S is a valid identifier according\n\
7685to the language definition.");
7686
7687static PyObject*
7688unicode_isidentifier(PyObject *self)
7689{
7690 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7691}
7692
Georg Brandl559e5d72008-06-11 18:37:52 +00007693PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007695\n\
7696Return True if all characters in S are considered\n\
7697printable in repr() or S is empty, False otherwise.");
7698
7699static PyObject*
7700unicode_isprintable(PyObject *self)
7701{
7702 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7703 register const Py_UNICODE *e;
7704
7705 /* Shortcut for single character strings */
7706 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7707 Py_RETURN_TRUE;
7708 }
7709
7710 e = p + PyUnicode_GET_SIZE(self);
7711 for (; p < e; p++) {
7712 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7713 Py_RETURN_FALSE;
7714 }
7715 }
7716 Py_RETURN_TRUE;
7717}
7718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007719PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007720 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721\n\
7722Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007723iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724
7725static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007726unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007728 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729}
7730
Martin v. Löwis18e16552006-02-15 17:27:45 +00007731static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732unicode_length(PyUnicodeObject *self)
7733{
7734 return self->length;
7735}
7736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007737PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007740Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007741done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742
7743static PyObject *
7744unicode_ljust(PyUnicodeObject *self, PyObject *args)
7745{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007746 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007747 Py_UNICODE fillchar = ' ';
7748
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007749 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 return NULL;
7751
Tim Peters7a29bd52001-09-12 03:03:31 +00007752 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 Py_INCREF(self);
7754 return (PyObject*) self;
7755 }
7756
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007757 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758}
7759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007760PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007763Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764
7765static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007766unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 return fixup(self, fixlower);
7769}
7770
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: its format string minus the leading
   three characters ("|O:"). */
#define STRIPNAME(i) (&stripformat[i][3])
7779
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007780/* externally visible for str.strip(unicode) */
7781PyObject *
7782_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7783{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007784 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7785 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7786 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7787 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7788 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007789
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007791
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 i = 0;
7793 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7795 i++;
7796 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007797 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007798
Benjamin Peterson14339b62009-01-31 16:36:08 +00007799 j = len;
7800 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 do {
7802 j--;
7803 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7804 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007805 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007806
Benjamin Peterson14339b62009-01-31 16:36:08 +00007807 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 Py_INCREF(self);
7809 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007810 }
7811 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007813}
7814
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815
7816static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007817do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007819 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7820 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007821
Benjamin Peterson14339b62009-01-31 16:36:08 +00007822 i = 0;
7823 if (striptype != RIGHTSTRIP) {
7824 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7825 i++;
7826 }
7827 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007828
Benjamin Peterson14339b62009-01-31 16:36:08 +00007829 j = len;
7830 if (striptype != LEFTSTRIP) {
7831 do {
7832 j--;
7833 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7834 j++;
7835 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007836
Benjamin Peterson14339b62009-01-31 16:36:08 +00007837 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7838 Py_INCREF(self);
7839 return (PyObject*)self;
7840 }
7841 else
7842 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843}
7844
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007845
7846static PyObject *
7847do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7848{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007849 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007850
Benjamin Peterson14339b62009-01-31 16:36:08 +00007851 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7852 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007853
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 if (sep != NULL && sep != Py_None) {
7855 if (PyUnicode_Check(sep))
7856 return _PyUnicode_XStrip(self, striptype, sep);
7857 else {
7858 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 "%s arg must be None or str",
7860 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007861 return NULL;
7862 }
7863 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007864
Benjamin Peterson14339b62009-01-31 16:36:08 +00007865 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007866}
7867
7868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007869PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007871\n\
7872Return a copy of the string S with leading and trailing\n\
7873whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007874If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007875
7876static PyObject *
7877unicode_strip(PyUnicodeObject *self, PyObject *args)
7878{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007879 if (PyTuple_GET_SIZE(args) == 0)
7880 return do_strip(self, BOTHSTRIP); /* Common case */
7881 else
7882 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007883}
7884
7885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007886PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007888\n\
7889Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007890If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007891
7892static PyObject *
7893unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7894{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007895 if (PyTuple_GET_SIZE(args) == 0)
7896 return do_strip(self, LEFTSTRIP); /* Common case */
7897 else
7898 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007899}
7900
7901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007902PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007904\n\
7905Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007906If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007907
7908static PyObject *
7909unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7910{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007911 if (PyTuple_GET_SIZE(args) == 0)
7912 return do_strip(self, RIGHTSTRIP); /* Common case */
7913 else
7914 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007915}
7916
7917
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007919unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920{
7921 PyUnicodeObject *u;
7922 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007923 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007924 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925
Georg Brandl222de0f2009-04-12 12:01:50 +00007926 if (len < 1) {
7927 Py_INCREF(unicode_empty);
7928 return (PyObject *)unicode_empty;
7929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930
Tim Peters7a29bd52001-09-12 03:03:31 +00007931 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932 /* no repeat, return original string */
7933 Py_INCREF(str);
7934 return (PyObject*) str;
7935 }
Tim Peters8f422462000-09-09 06:13:41 +00007936
7937 /* ensure # of chars needed doesn't overflow int and # of bytes
7938 * needed doesn't overflow size_t
7939 */
7940 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007941 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007942 PyErr_SetString(PyExc_OverflowError,
7943 "repeated string is too long");
7944 return NULL;
7945 }
7946 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7947 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7948 PyErr_SetString(PyExc_OverflowError,
7949 "repeated string is too long");
7950 return NULL;
7951 }
7952 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 if (!u)
7954 return NULL;
7955
7956 p = u->str;
7957
Georg Brandl222de0f2009-04-12 12:01:50 +00007958 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007959 Py_UNICODE_FILL(p, str->str[0], len);
7960 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007961 Py_ssize_t done = str->length; /* number of characters copied this far */
7962 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007964 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007965 Py_UNICODE_COPY(p+done, p, n);
7966 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 }
7969
7970 return (PyObject*) u;
7971}
7972
7973PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 PyObject *subobj,
7975 PyObject *replobj,
7976 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977{
7978 PyObject *self;
7979 PyObject *str1;
7980 PyObject *str2;
7981 PyObject *result;
7982
7983 self = PyUnicode_FromObject(obj);
7984 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 str1 = PyUnicode_FromObject(subobj);
7987 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 Py_DECREF(self);
7989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 }
7991 str2 = PyUnicode_FromObject(replobj);
7992 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 Py_DECREF(self);
7994 Py_DECREF(str1);
7995 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 }
Tim Petersced69f82003-09-16 20:30:58 +00007997 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 (PyUnicodeObject *)str1,
7999 (PyUnicodeObject *)str2,
8000 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 Py_DECREF(self);
8002 Py_DECREF(str1);
8003 Py_DECREF(str2);
8004 return result;
8005}
8006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008007PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008008 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009\n\
8010Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008011old replaced by new. If the optional argument count is\n\
8012given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013
8014static PyObject*
8015unicode_replace(PyUnicodeObject *self, PyObject *args)
8016{
8017 PyUnicodeObject *str1;
8018 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008019 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 PyObject *result;
8021
Martin v. Löwis18e16552006-02-15 17:27:45 +00008022 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 return NULL;
8024 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8025 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008028 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 Py_DECREF(str1);
8030 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032
8033 result = replace(self, str1, str2, maxcount);
8034
8035 Py_DECREF(str1);
8036 Py_DECREF(str2);
8037 return result;
8038}
8039
8040static
8041PyObject *unicode_repr(PyObject *unicode)
8042{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008043 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008044 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008045 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8046 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8047
8048 /* XXX(nnorwitz): rather than over-allocating, it would be
8049 better to choose a different scheme. Perhaps scan the
8050 first N-chars of the string and allocate based on that size.
8051 */
8052 /* Initial allocation is based on the longest-possible unichr
8053 escape.
8054
8055 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8056 unichr, so in this case it's the longest unichr escape. In
8057 narrow (UTF-16) builds this is five chars per source unichr
8058 since there are two unichrs in the surrogate pair, so in narrow
8059 (UTF-16) builds it's not the longest unichr escape.
8060
8061 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8062 so in the narrow (UTF-16) build case it's the longest unichr
8063 escape.
8064 */
8065
Walter Dörwald1ab83302007-05-18 17:15:44 +00008066 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008068#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008070#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008072#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008074 if (repr == NULL)
8075 return NULL;
8076
Walter Dörwald1ab83302007-05-18 17:15:44 +00008077 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008078
8079 /* Add quote */
8080 *p++ = (findchar(s, size, '\'') &&
8081 !findchar(s, size, '"')) ? '"' : '\'';
8082 while (size-- > 0) {
8083 Py_UNICODE ch = *s++;
8084
8085 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008086 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008087 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008088 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008089 continue;
8090 }
8091
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008093 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008094 *p++ = '\\';
8095 *p++ = 't';
8096 }
8097 else if (ch == '\n') {
8098 *p++ = '\\';
8099 *p++ = 'n';
8100 }
8101 else if (ch == '\r') {
8102 *p++ = '\\';
8103 *p++ = 'r';
8104 }
8105
8106 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008107 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008108 *p++ = '\\';
8109 *p++ = 'x';
8110 *p++ = hexdigits[(ch >> 4) & 0x000F];
8111 *p++ = hexdigits[ch & 0x000F];
8112 }
8113
Georg Brandl559e5d72008-06-11 18:37:52 +00008114 /* Copy ASCII characters as-is */
8115 else if (ch < 0x7F) {
8116 *p++ = ch;
8117 }
8118
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008120 else {
8121 Py_UCS4 ucs = ch;
8122
8123#ifndef Py_UNICODE_WIDE
8124 Py_UNICODE ch2 = 0;
8125 /* Get code point from surrogate pair */
8126 if (size > 0) {
8127 ch2 = *s;
8128 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008132 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008133 size--;
8134 }
8135 }
8136#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008137 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008138 (categories Z* and C* except ASCII space)
8139 */
8140 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8141 /* Map 8-bit characters to '\xhh' */
8142 if (ucs <= 0xff) {
8143 *p++ = '\\';
8144 *p++ = 'x';
8145 *p++ = hexdigits[(ch >> 4) & 0x000F];
8146 *p++ = hexdigits[ch & 0x000F];
8147 }
8148 /* Map 21-bit characters to '\U00xxxxxx' */
8149 else if (ucs >= 0x10000) {
8150 *p++ = '\\';
8151 *p++ = 'U';
8152 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8153 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8154 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8155 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8156 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8157 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8158 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8159 *p++ = hexdigits[ucs & 0x0000000F];
8160 }
8161 /* Map 16-bit characters to '\uxxxx' */
8162 else {
8163 *p++ = '\\';
8164 *p++ = 'u';
8165 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8166 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8167 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8168 *p++ = hexdigits[ucs & 0x000F];
8169 }
8170 }
8171 /* Copy characters as-is */
8172 else {
8173 *p++ = ch;
8174#ifndef Py_UNICODE_WIDE
8175 if (ucs >= 0x10000)
8176 *p++ = ch2;
8177#endif
8178 }
8179 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008180 }
8181 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008182 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008183
8184 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008185 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008186 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187}
8188
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008189PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191\n\
8192Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008193such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194arguments start and end are interpreted as in slice notation.\n\
8195\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008196Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197
8198static PyObject *
8199unicode_rfind(PyUnicodeObject *self, PyObject *args)
8200{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008201 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008202 Py_ssize_t start;
8203 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008204 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205
Christian Heimes9cd17752007-11-18 19:35:23 +00008206 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208
Thomas Wouters477c8d52006-05-27 19:21:47 +00008209 result = stringlib_rfind_slice(
8210 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8211 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8212 start, end
8213 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214
8215 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008216
Christian Heimes217cfd12007-12-02 14:31:20 +00008217 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218}
8219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008220PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008223Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224
8225static PyObject *
8226unicode_rindex(PyUnicodeObject *self, PyObject *args)
8227{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008228 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008229 Py_ssize_t start;
8230 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008231 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232
Christian Heimes9cd17752007-11-18 19:35:23 +00008233 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008234 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235
Thomas Wouters477c8d52006-05-27 19:21:47 +00008236 result = stringlib_rfind_slice(
8237 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8238 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8239 start, end
8240 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241
8242 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008243
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 if (result < 0) {
8245 PyErr_SetString(PyExc_ValueError, "substring not found");
8246 return NULL;
8247 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008248 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249}
8250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008251PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008254Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008255done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256
8257static PyObject *
8258unicode_rjust(PyUnicodeObject *self, PyObject *args)
8259{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008260 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008261 Py_UNICODE fillchar = ' ';
8262
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008263 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 return NULL;
8265
Tim Peters7a29bd52001-09-12 03:03:31 +00008266 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 Py_INCREF(self);
8268 return (PyObject*) self;
8269 }
8270
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008271 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272}
8273
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 PyObject *sep,
8276 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277{
8278 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008279
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 s = PyUnicode_FromObject(s);
8281 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008282 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 if (sep != NULL) {
8284 sep = PyUnicode_FromObject(sep);
8285 if (sep == NULL) {
8286 Py_DECREF(s);
8287 return NULL;
8288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
8290
8291 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8292
8293 Py_DECREF(s);
8294 Py_XDECREF(sep);
8295 return result;
8296}
8297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008298PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300\n\
8301Return a list of the words in S, using sep as the\n\
8302delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008303splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008304whitespace string is a separator and empty strings are\n\
8305removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306
8307static PyObject*
8308unicode_split(PyUnicodeObject *self, PyObject *args)
8309{
8310 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008311 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312
Martin v. Löwis18e16552006-02-15 17:27:45 +00008313 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 return NULL;
8315
8316 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322}
8323
Thomas Wouters477c8d52006-05-27 19:21:47 +00008324PyObject *
8325PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8326{
8327 PyObject* str_obj;
8328 PyObject* sep_obj;
8329 PyObject* out;
8330
8331 str_obj = PyUnicode_FromObject(str_in);
8332 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008334 sep_obj = PyUnicode_FromObject(sep_in);
8335 if (!sep_obj) {
8336 Py_DECREF(str_obj);
8337 return NULL;
8338 }
8339
8340 out = stringlib_partition(
8341 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8342 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8343 );
8344
8345 Py_DECREF(sep_obj);
8346 Py_DECREF(str_obj);
8347
8348 return out;
8349}
8350
8351
8352PyObject *
8353PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8354{
8355 PyObject* str_obj;
8356 PyObject* sep_obj;
8357 PyObject* out;
8358
8359 str_obj = PyUnicode_FromObject(str_in);
8360 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008362 sep_obj = PyUnicode_FromObject(sep_in);
8363 if (!sep_obj) {
8364 Py_DECREF(str_obj);
8365 return NULL;
8366 }
8367
8368 out = stringlib_rpartition(
8369 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8370 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8371 );
8372
8373 Py_DECREF(sep_obj);
8374 Py_DECREF(str_obj);
8375
8376 return out;
8377}
8378
8379PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008381\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008382Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008383the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008384found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008385
8386static PyObject*
8387unicode_partition(PyUnicodeObject *self, PyObject *separator)
8388{
8389 return PyUnicode_Partition((PyObject *)self, separator);
8390}
8391
8392PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008393 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008394\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008395Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008396the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008397separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008398
8399static PyObject*
8400unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8401{
8402 return PyUnicode_RPartition((PyObject *)self, separator);
8403}
8404
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008405PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 PyObject *sep,
8407 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008408{
8409 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008411 s = PyUnicode_FromObject(s);
8412 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 if (sep != NULL) {
8415 sep = PyUnicode_FromObject(sep);
8416 if (sep == NULL) {
8417 Py_DECREF(s);
8418 return NULL;
8419 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008420 }
8421
8422 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8423
8424 Py_DECREF(s);
8425 Py_XDECREF(sep);
8426 return result;
8427}
8428
8429PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008431\n\
8432Return a list of the words in S, using sep as the\n\
8433delimiter string, starting at the end of the string and\n\
8434working to the front. If maxsplit is given, at most maxsplit\n\
8435splits are done. If sep is not specified, any whitespace string\n\
8436is a separator.");
8437
8438static PyObject*
8439unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8440{
8441 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008442 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008443
Martin v. Löwis18e16552006-02-15 17:27:45 +00008444 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008445 return NULL;
8446
8447 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008449 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008451 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008453}
8454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008455PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457\n\
8458Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008459Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008460is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461
8462static PyObject*
8463unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8464{
Guido van Rossum86662912000-04-11 15:38:46 +00008465 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466
Guido van Rossum86662912000-04-11 15:38:46 +00008467 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468 return NULL;
8469
Guido van Rossum86662912000-04-11 15:38:46 +00008470 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471}
8472
8473static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008474PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475{
Walter Dörwald346737f2007-05-31 10:44:43 +00008476 if (PyUnicode_CheckExact(self)) {
8477 Py_INCREF(self);
8478 return self;
8479 } else
8480 /* Subtype -- return genuine unicode string with the same value. */
8481 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8482 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483}
8484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008485PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487\n\
8488Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008489and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490
8491static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008492unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494 return fixup(self, fixswapcase);
8495}
8496
Georg Brandlceee0772007-11-27 23:48:05 +00008497PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008499\n\
8500Return a translation table usable for str.translate().\n\
8501If there is only one argument, it must be a dictionary mapping Unicode\n\
8502ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008503Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008504If there are two arguments, they must be strings of equal length, and\n\
8505in the resulting dictionary, each character in x will be mapped to the\n\
8506character at the same position in y. If there is a third argument, it\n\
8507must be a string, whose characters will be mapped to None in the result.");
8508
8509static PyObject*
8510unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8511{
8512 PyObject *x, *y = NULL, *z = NULL;
8513 PyObject *new = NULL, *key, *value;
8514 Py_ssize_t i = 0;
8515 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008516
Georg Brandlceee0772007-11-27 23:48:05 +00008517 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8518 return NULL;
8519 new = PyDict_New();
8520 if (!new)
8521 return NULL;
8522 if (y != NULL) {
8523 /* x must be a string too, of equal length */
8524 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8525 if (!PyUnicode_Check(x)) {
8526 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8527 "be a string if there is a second argument");
8528 goto err;
8529 }
8530 if (PyUnicode_GET_SIZE(x) != ylen) {
8531 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8532 "arguments must have equal length");
8533 goto err;
8534 }
8535 /* create entries for translating chars in x to those in y */
8536 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008537 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8538 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008539 if (!key || !value)
8540 goto err;
8541 res = PyDict_SetItem(new, key, value);
8542 Py_DECREF(key);
8543 Py_DECREF(value);
8544 if (res < 0)
8545 goto err;
8546 }
8547 /* create entries for deleting chars in z */
8548 if (z != NULL) {
8549 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008550 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008551 if (!key)
8552 goto err;
8553 res = PyDict_SetItem(new, key, Py_None);
8554 Py_DECREF(key);
8555 if (res < 0)
8556 goto err;
8557 }
8558 }
8559 } else {
8560 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008561 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008562 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8563 "to maketrans it must be a dict");
8564 goto err;
8565 }
8566 /* copy entries into the new dict, converting string keys to int keys */
8567 while (PyDict_Next(x, &i, &key, &value)) {
8568 if (PyUnicode_Check(key)) {
8569 /* convert string keys to integer keys */
8570 PyObject *newkey;
8571 if (PyUnicode_GET_SIZE(key) != 1) {
8572 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8573 "table must be of length 1");
8574 goto err;
8575 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008576 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008577 if (!newkey)
8578 goto err;
8579 res = PyDict_SetItem(new, newkey, value);
8580 Py_DECREF(newkey);
8581 if (res < 0)
8582 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008583 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008584 /* just keep integer keys */
8585 if (PyDict_SetItem(new, key, value) < 0)
8586 goto err;
8587 } else {
8588 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8589 "be strings or integers");
8590 goto err;
8591 }
8592 }
8593 }
8594 return new;
8595 err:
8596 Py_DECREF(new);
8597 return NULL;
8598}
8599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008600PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602\n\
8603Return a copy of the string S, where all characters have been mapped\n\
8604through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008605Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008606Unmapped characters are left untouched. Characters mapped to None\n\
8607are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608
8609static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008610unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611{
Georg Brandlceee0772007-11-27 23:48:05 +00008612 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613}
8614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008615PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008618Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619
8620static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008621unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 return fixup(self, fixupper);
8624}
8625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008626PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008629Pad a numeric string S with zeros on the left, to fill a field\n\
8630of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631
8632static PyObject *
8633unicode_zfill(PyUnicodeObject *self, PyObject *args)
8634{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008635 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 PyUnicodeObject *u;
8637
Martin v. Löwis18e16552006-02-15 17:27:45 +00008638 Py_ssize_t width;
8639 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 return NULL;
8641
8642 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008643 if (PyUnicode_CheckExact(self)) {
8644 Py_INCREF(self);
8645 return (PyObject*) self;
8646 }
8647 else
8648 return PyUnicode_FromUnicode(
8649 PyUnicode_AS_UNICODE(self),
8650 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 }
8653
8654 fill = width - self->length;
8655
8656 u = pad(self, fill, 0, '0');
8657
Walter Dörwald068325e2002-04-15 13:36:47 +00008658 if (u == NULL)
8659 return NULL;
8660
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661 if (u->str[fill] == '+' || u->str[fill] == '-') {
8662 /* move sign to beginning of string */
8663 u->str[0] = u->str[fill];
8664 u->str[fill] = '0';
8665 }
8666
8667 return (PyObject*) u;
8668}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669
#if 0
/* Debugging aid (compiled out): reports the current value of the
   file-level `numfree` counter as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = (long)numfree;

    return PyLong_FromLong(count);
}
#endif
8677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008678PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008681Return True if S starts with the specified prefix, False otherwise.\n\
8682With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008683With optional end, stop comparing S at that position.\n\
8684prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685
8686static PyObject *
8687unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008690 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008692 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008693 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008694 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008696 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8698 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008699 if (PyTuple_Check(subobj)) {
8700 Py_ssize_t i;
8701 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8702 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008704 if (substring == NULL)
8705 return NULL;
8706 result = tailmatch(self, substring, start, end, -1);
8707 Py_DECREF(substring);
8708 if (result) {
8709 Py_RETURN_TRUE;
8710 }
8711 }
8712 /* nothing matched */
8713 Py_RETURN_FALSE;
8714 }
8715 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008718 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008720 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721}
8722
8723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008724PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008727Return True if S ends with the specified suffix, False otherwise.\n\
8728With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008729With optional end, stop comparing S at that position.\n\
8730suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731
8732static PyObject *
8733unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008736 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008738 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008739 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008740 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008742 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8744 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008745 if (PyTuple_Check(subobj)) {
8746 Py_ssize_t i;
8747 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8748 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008750 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008752 result = tailmatch(self, substring, start, end, +1);
8753 Py_DECREF(substring);
8754 if (result) {
8755 Py_RETURN_TRUE;
8756 }
8757 }
8758 Py_RETURN_FALSE;
8759 }
8760 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008764 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008766 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767}
8768
Eric Smith8c663262007-08-25 02:26:07 +00008769#include "stringlib/string_format.h"
8770
8771PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008773\n\
8774");
8775
Eric Smith4a7d76d2008-05-30 18:10:19 +00008776static PyObject *
8777unicode__format__(PyObject* self, PyObject* args)
8778{
8779 PyObject *format_spec;
8780
8781 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8782 return NULL;
8783
8784 return _PyUnicode_FormatAdvanced(self,
8785 PyUnicode_AS_UNICODE(format_spec),
8786 PyUnicode_GET_SIZE(format_spec));
8787}
8788
Eric Smith8c663262007-08-25 02:26:07 +00008789PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008791\n\
8792");
8793
8794static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008795unicode__sizeof__(PyUnicodeObject *v)
8796{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008797 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8798 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008799}
8800
8801PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008803
8804static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008805unicode_getnewargs(PyUnicodeObject *v)
8806{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008807 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008808}
8809
8810
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811static PyMethodDef unicode_methods[] = {
8812
8813 /* Order is according to common usage: often used methods should
8814 appear first, since lookup is done sequentially. */
8815
Benjamin Peterson308d6372009-09-18 21:42:35 +00008816 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008817 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8818 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008819 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008820 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8821 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8822 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8823 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8824 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8825 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8826 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008827 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008828 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8829 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8830 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008831 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008832 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8833 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8834 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008835 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008836 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008837 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008838 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008839 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8840 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8841 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8842 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8843 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8844 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8845 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8846 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8847 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8848 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8849 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8850 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8851 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8852 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008853 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008854 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008855 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008856 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008857 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008858 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8859 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008860 {"maketrans", (PyCFunction) unicode_maketrans,
8861 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008862 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008863#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008864 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865#endif
8866
8867#if 0
8868 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008869 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870#endif
8871
Benjamin Peterson14339b62009-01-31 16:36:08 +00008872 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 {NULL, NULL}
8874};
8875
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008876static PyObject *
8877unicode_mod(PyObject *v, PyObject *w)
8878{
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 if (!PyUnicode_Check(v)) {
8880 Py_INCREF(Py_NotImplemented);
8881 return Py_NotImplemented;
8882 }
8883 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008884}
8885
8886static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008887 0, /*nb_add*/
8888 0, /*nb_subtract*/
8889 0, /*nb_multiply*/
8890 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008891};
8892
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008894 (lenfunc) unicode_length, /* sq_length */
8895 PyUnicode_Concat, /* sq_concat */
8896 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8897 (ssizeargfunc) unicode_getitem, /* sq_item */
8898 0, /* sq_slice */
8899 0, /* sq_ass_item */
8900 0, /* sq_ass_slice */
8901 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902};
8903
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008904static PyObject*
8905unicode_subscript(PyUnicodeObject* self, PyObject* item)
8906{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008907 if (PyIndex_Check(item)) {
8908 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008909 if (i == -1 && PyErr_Occurred())
8910 return NULL;
8911 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008912 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008913 return unicode_getitem(self, i);
8914 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008915 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008916 Py_UNICODE* source_buf;
8917 Py_UNICODE* result_buf;
8918 PyObject* result;
8919
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008920 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008922 return NULL;
8923 }
8924
8925 if (slicelength <= 0) {
8926 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008927 } else if (start == 0 && step == 1 && slicelength == self->length &&
8928 PyUnicode_CheckExact(self)) {
8929 Py_INCREF(self);
8930 return (PyObject *)self;
8931 } else if (step == 1) {
8932 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008933 } else {
8934 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008935 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8936 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008937
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 if (result_buf == NULL)
8939 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008940
8941 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8942 result_buf[i] = source_buf[cur];
8943 }
Tim Petersced69f82003-09-16 20:30:58 +00008944
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008945 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008946 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008947 return result;
8948 }
8949 } else {
8950 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8951 return NULL;
8952 }
8953}
8954
8955static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008956 (lenfunc)unicode_length, /* mp_length */
8957 (binaryfunc)unicode_subscript, /* mp_subscript */
8958 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008959};
8960
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962/* Helpers for PyUnicode_Format() */
8963
8964static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008965getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008967 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 (*p_argidx)++;
8970 if (arglen < 0)
8971 return args;
8972 else
8973 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 }
8975 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 return NULL;
8978}
8979
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008980/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008982static PyObject *
8983formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008985 char *p;
8986 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008988
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 x = PyFloat_AsDouble(v);
8990 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008991 return NULL;
8992
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008995
Eric Smith0923d1d2009-04-16 20:16:10 +00008996 p = PyOS_double_to_string(x, type, prec,
8997 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008998 if (p == NULL)
8999 return NULL;
9000 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009001 PyMem_Free(p);
9002 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003}
9004
Tim Peters38fd5b62000-09-21 05:43:11 +00009005static PyObject*
9006formatlong(PyObject *val, int flags, int prec, int type)
9007{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009008 char *buf;
9009 int len;
9010 PyObject *str; /* temporary string object. */
9011 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009012
Benjamin Peterson14339b62009-01-31 16:36:08 +00009013 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9014 if (!str)
9015 return NULL;
9016 result = PyUnicode_FromStringAndSize(buf, len);
9017 Py_DECREF(str);
9018 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009019}
9020
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021static int
9022formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009023 size_t buflen,
9024 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009026 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009027 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 if (PyUnicode_GET_SIZE(v) == 1) {
9029 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9030 buf[1] = '\0';
9031 return 1;
9032 }
9033#ifndef Py_UNICODE_WIDE
9034 if (PyUnicode_GET_SIZE(v) == 2) {
9035 /* Decode a valid surrogate pair */
9036 int c0 = PyUnicode_AS_UNICODE(v)[0];
9037 int c1 = PyUnicode_AS_UNICODE(v)[1];
9038 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9039 0xDC00 <= c1 && c1 <= 0xDFFF) {
9040 buf[0] = c0;
9041 buf[1] = c1;
9042 buf[2] = '\0';
9043 return 2;
9044 }
9045 }
9046#endif
9047 goto onError;
9048 }
9049 else {
9050 /* Integer input truncated to a character */
9051 long x;
9052 x = PyLong_AsLong(v);
9053 if (x == -1 && PyErr_Occurred())
9054 goto onError;
9055
9056 if (x < 0 || x > 0x10ffff) {
9057 PyErr_SetString(PyExc_OverflowError,
9058 "%c arg not in range(0x110000)");
9059 return -1;
9060 }
9061
9062#ifndef Py_UNICODE_WIDE
9063 if (x > 0xffff) {
9064 x -= 0x10000;
9065 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9066 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9067 return 2;
9068 }
9069#endif
9070 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009071 buf[1] = '\0';
9072 return 1;
9073 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009074
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009076 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009078 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079}
9080
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length, in characters, of the buffer in which
   single characters are formatted.  The expansion is fully parenthesized
   so that the macro behaves as one operand in any expression context
   (for example `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009085
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009087 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088{
9089 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009090 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 int args_owned = 0;
9092 PyUnicodeObject *result = NULL;
9093 PyObject *dict = NULL;
9094 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009095
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 PyErr_BadInternalCall();
9098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099 }
9100 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009101 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103 fmt = PyUnicode_AS_UNICODE(uformat);
9104 fmtcnt = PyUnicode_GET_SIZE(uformat);
9105
9106 reslen = rescnt = fmtcnt + 100;
9107 result = _PyUnicode_New(reslen);
9108 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009109 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 res = PyUnicode_AS_UNICODE(result);
9111
9112 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 arglen = PyTuple_Size(args);
9114 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115 }
9116 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 arglen = -1;
9118 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009120 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009121 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123
9124 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 if (*fmt != '%') {
9126 if (--rescnt < 0) {
9127 rescnt = fmtcnt + 100;
9128 reslen += rescnt;
9129 if (_PyUnicode_Resize(&result, reslen) < 0)
9130 goto onError;
9131 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9132 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009133 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009135 }
9136 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 /* Got a format specifier */
9138 int flags = 0;
9139 Py_ssize_t width = -1;
9140 int prec = -1;
9141 Py_UNICODE c = '\0';
9142 Py_UNICODE fill;
9143 int isnumok;
9144 PyObject *v = NULL;
9145 PyObject *temp = NULL;
9146 Py_UNICODE *pbuf;
9147 Py_UNICODE sign;
9148 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009149 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 fmt++;
9152 if (*fmt == '(') {
9153 Py_UNICODE *keystart;
9154 Py_ssize_t keylen;
9155 PyObject *key;
9156 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009157
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 if (dict == NULL) {
9159 PyErr_SetString(PyExc_TypeError,
9160 "format requires a mapping");
9161 goto onError;
9162 }
9163 ++fmt;
9164 --fmtcnt;
9165 keystart = fmt;
9166 /* Skip over balanced parentheses */
9167 while (pcount > 0 && --fmtcnt >= 0) {
9168 if (*fmt == ')')
9169 --pcount;
9170 else if (*fmt == '(')
9171 ++pcount;
9172 fmt++;
9173 }
9174 keylen = fmt - keystart - 1;
9175 if (fmtcnt < 0 || pcount > 0) {
9176 PyErr_SetString(PyExc_ValueError,
9177 "incomplete format key");
9178 goto onError;
9179 }
9180#if 0
9181 /* keys are converted to strings using UTF-8 and
9182 then looked up since Python uses strings to hold
9183 variables names etc. in its namespaces and we
9184 wouldn't want to break common idioms. */
9185 key = PyUnicode_EncodeUTF8(keystart,
9186 keylen,
9187 NULL);
9188#else
9189 key = PyUnicode_FromUnicode(keystart, keylen);
9190#endif
9191 if (key == NULL)
9192 goto onError;
9193 if (args_owned) {
9194 Py_DECREF(args);
9195 args_owned = 0;
9196 }
9197 args = PyObject_GetItem(dict, key);
9198 Py_DECREF(key);
9199 if (args == NULL) {
9200 goto onError;
9201 }
9202 args_owned = 1;
9203 arglen = -1;
9204 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009205 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 while (--fmtcnt >= 0) {
9207 switch (c = *fmt++) {
9208 case '-': flags |= F_LJUST; continue;
9209 case '+': flags |= F_SIGN; continue;
9210 case ' ': flags |= F_BLANK; continue;
9211 case '#': flags |= F_ALT; continue;
9212 case '0': flags |= F_ZERO; continue;
9213 }
9214 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009215 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009216 if (c == '*') {
9217 v = getnextarg(args, arglen, &argidx);
9218 if (v == NULL)
9219 goto onError;
9220 if (!PyLong_Check(v)) {
9221 PyErr_SetString(PyExc_TypeError,
9222 "* wants int");
9223 goto onError;
9224 }
9225 width = PyLong_AsLong(v);
9226 if (width == -1 && PyErr_Occurred())
9227 goto onError;
9228 if (width < 0) {
9229 flags |= F_LJUST;
9230 width = -width;
9231 }
9232 if (--fmtcnt >= 0)
9233 c = *fmt++;
9234 }
9235 else if (c >= '0' && c <= '9') {
9236 width = c - '0';
9237 while (--fmtcnt >= 0) {
9238 c = *fmt++;
9239 if (c < '0' || c > '9')
9240 break;
9241 if ((width*10) / 10 != width) {
9242 PyErr_SetString(PyExc_ValueError,
9243 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009244 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 }
9246 width = width*10 + (c - '0');
9247 }
9248 }
9249 if (c == '.') {
9250 prec = 0;
9251 if (--fmtcnt >= 0)
9252 c = *fmt++;
9253 if (c == '*') {
9254 v = getnextarg(args, arglen, &argidx);
9255 if (v == NULL)
9256 goto onError;
9257 if (!PyLong_Check(v)) {
9258 PyErr_SetString(PyExc_TypeError,
9259 "* wants int");
9260 goto onError;
9261 }
9262 prec = PyLong_AsLong(v);
9263 if (prec == -1 && PyErr_Occurred())
9264 goto onError;
9265 if (prec < 0)
9266 prec = 0;
9267 if (--fmtcnt >= 0)
9268 c = *fmt++;
9269 }
9270 else if (c >= '0' && c <= '9') {
9271 prec = c - '0';
9272 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009273 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009274 if (c < '0' || c > '9')
9275 break;
9276 if ((prec*10) / 10 != prec) {
9277 PyErr_SetString(PyExc_ValueError,
9278 "prec too big");
9279 goto onError;
9280 }
9281 prec = prec*10 + (c - '0');
9282 }
9283 }
9284 } /* prec */
9285 if (fmtcnt >= 0) {
9286 if (c == 'h' || c == 'l' || c == 'L') {
9287 if (--fmtcnt >= 0)
9288 c = *fmt++;
9289 }
9290 }
9291 if (fmtcnt < 0) {
9292 PyErr_SetString(PyExc_ValueError,
9293 "incomplete format");
9294 goto onError;
9295 }
9296 if (c != '%') {
9297 v = getnextarg(args, arglen, &argidx);
9298 if (v == NULL)
9299 goto onError;
9300 }
9301 sign = 0;
9302 fill = ' ';
9303 switch (c) {
9304
9305 case '%':
9306 pbuf = formatbuf;
9307 /* presume that buffer length is at least 1 */
9308 pbuf[0] = '%';
9309 len = 1;
9310 break;
9311
9312 case 's':
9313 case 'r':
9314 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009315 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009316 temp = v;
9317 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009318 }
9319 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009320 if (c == 's')
9321 temp = PyObject_Str(v);
9322 else if (c == 'r')
9323 temp = PyObject_Repr(v);
9324 else
9325 temp = PyObject_ASCII(v);
9326 if (temp == NULL)
9327 goto onError;
9328 if (PyUnicode_Check(temp))
9329 /* nothing to do */;
9330 else {
9331 Py_DECREF(temp);
9332 PyErr_SetString(PyExc_TypeError,
9333 "%s argument has non-string str()");
9334 goto onError;
9335 }
9336 }
9337 pbuf = PyUnicode_AS_UNICODE(temp);
9338 len = PyUnicode_GET_SIZE(temp);
9339 if (prec >= 0 && len > prec)
9340 len = prec;
9341 break;
9342
9343 case 'i':
9344 case 'd':
9345 case 'u':
9346 case 'o':
9347 case 'x':
9348 case 'X':
9349 if (c == 'i')
9350 c = 'd';
9351 isnumok = 0;
9352 if (PyNumber_Check(v)) {
9353 PyObject *iobj=NULL;
9354
9355 if (PyLong_Check(v)) {
9356 iobj = v;
9357 Py_INCREF(iobj);
9358 }
9359 else {
9360 iobj = PyNumber_Long(v);
9361 }
9362 if (iobj!=NULL) {
9363 if (PyLong_Check(iobj)) {
9364 isnumok = 1;
9365 temp = formatlong(iobj, flags, prec, c);
9366 Py_DECREF(iobj);
9367 if (!temp)
9368 goto onError;
9369 pbuf = PyUnicode_AS_UNICODE(temp);
9370 len = PyUnicode_GET_SIZE(temp);
9371 sign = 1;
9372 }
9373 else {
9374 Py_DECREF(iobj);
9375 }
9376 }
9377 }
9378 if (!isnumok) {
9379 PyErr_Format(PyExc_TypeError,
9380 "%%%c format: a number is required, "
9381 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9382 goto onError;
9383 }
9384 if (flags & F_ZERO)
9385 fill = '0';
9386 break;
9387
9388 case 'e':
9389 case 'E':
9390 case 'f':
9391 case 'F':
9392 case 'g':
9393 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009394 temp = formatfloat(v, flags, prec, c);
9395 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009396 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009397 pbuf = PyUnicode_AS_UNICODE(temp);
9398 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 sign = 1;
9400 if (flags & F_ZERO)
9401 fill = '0';
9402 break;
9403
9404 case 'c':
9405 pbuf = formatbuf;
9406 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9407 if (len < 0)
9408 goto onError;
9409 break;
9410
9411 default:
9412 PyErr_Format(PyExc_ValueError,
9413 "unsupported format character '%c' (0x%x) "
9414 "at index %zd",
9415 (31<=c && c<=126) ? (char)c : '?',
9416 (int)c,
9417 (Py_ssize_t)(fmt - 1 -
9418 PyUnicode_AS_UNICODE(uformat)));
9419 goto onError;
9420 }
9421 if (sign) {
9422 if (*pbuf == '-' || *pbuf == '+') {
9423 sign = *pbuf++;
9424 len--;
9425 }
9426 else if (flags & F_SIGN)
9427 sign = '+';
9428 else if (flags & F_BLANK)
9429 sign = ' ';
9430 else
9431 sign = 0;
9432 }
9433 if (width < len)
9434 width = len;
9435 if (rescnt - (sign != 0) < width) {
9436 reslen -= rescnt;
9437 rescnt = width + fmtcnt + 100;
9438 reslen += rescnt;
9439 if (reslen < 0) {
9440 Py_XDECREF(temp);
9441 PyErr_NoMemory();
9442 goto onError;
9443 }
9444 if (_PyUnicode_Resize(&result, reslen) < 0) {
9445 Py_XDECREF(temp);
9446 goto onError;
9447 }
9448 res = PyUnicode_AS_UNICODE(result)
9449 + reslen - rescnt;
9450 }
9451 if (sign) {
9452 if (fill != ' ')
9453 *res++ = sign;
9454 rescnt--;
9455 if (width > len)
9456 width--;
9457 }
9458 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9459 assert(pbuf[0] == '0');
9460 assert(pbuf[1] == c);
9461 if (fill != ' ') {
9462 *res++ = *pbuf++;
9463 *res++ = *pbuf++;
9464 }
9465 rescnt -= 2;
9466 width -= 2;
9467 if (width < 0)
9468 width = 0;
9469 len -= 2;
9470 }
9471 if (width > len && !(flags & F_LJUST)) {
9472 do {
9473 --rescnt;
9474 *res++ = fill;
9475 } while (--width > len);
9476 }
9477 if (fill == ' ') {
9478 if (sign)
9479 *res++ = sign;
9480 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9481 assert(pbuf[0] == '0');
9482 assert(pbuf[1] == c);
9483 *res++ = *pbuf++;
9484 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009485 }
9486 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 Py_UNICODE_COPY(res, pbuf, len);
9488 res += len;
9489 rescnt -= len;
9490 while (--width >= len) {
9491 --rescnt;
9492 *res++ = ' ';
9493 }
9494 if (dict && (argidx < arglen) && c != '%') {
9495 PyErr_SetString(PyExc_TypeError,
9496 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009497 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 goto onError;
9499 }
9500 Py_XDECREF(temp);
9501 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 } /* until end */
9503 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 PyErr_SetString(PyExc_TypeError,
9505 "not all arguments converted during string formatting");
9506 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 }
9508
Thomas Woutersa96affe2006-03-12 00:29:36 +00009509 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513 }
9514 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 return (PyObject *)result;
9516
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 Py_XDECREF(result);
9519 Py_DECREF(uformat);
9520 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522 }
9523 return NULL;
9524}
9525
Jeremy Hylton938ace62002-07-17 16:30:39 +00009526static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009527unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9528
Tim Peters6d6c1a32001-08-02 04:15:00 +00009529static PyObject *
9530unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9531{
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009533 static char *kwlist[] = {"object", "encoding", "errors", 0};
9534 char *encoding = NULL;
9535 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009536
Benjamin Peterson14339b62009-01-31 16:36:08 +00009537 if (type != &PyUnicode_Type)
9538 return unicode_subtype_new(type, args, kwds);
9539 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009541 return NULL;
9542 if (x == NULL)
9543 return (PyObject *)_PyUnicode_New(0);
9544 if (encoding == NULL && errors == NULL)
9545 return PyObject_Str(x);
9546 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009548}
9549
Guido van Rossume023fe02001-08-30 03:12:59 +00009550static PyObject *
9551unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9552{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009553 PyUnicodeObject *tmp, *pnew;
9554 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009555
Benjamin Peterson14339b62009-01-31 16:36:08 +00009556 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9557 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9558 if (tmp == NULL)
9559 return NULL;
9560 assert(PyUnicode_Check(tmp));
9561 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9562 if (pnew == NULL) {
9563 Py_DECREF(tmp);
9564 return NULL;
9565 }
9566 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9567 if (pnew->str == NULL) {
9568 _Py_ForgetReference((PyObject *)pnew);
9569 PyObject_Del(pnew);
9570 Py_DECREF(tmp);
9571 return PyErr_NoMemory();
9572 }
9573 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9574 pnew->length = n;
9575 pnew->hash = tmp->hash;
9576 Py_DECREF(tmp);
9577 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009578}
9579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009580PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009581 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009582\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009583Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009584encoding defaults to the current default string encoding.\n\
9585errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009586
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009587static PyObject *unicode_iter(PyObject *seq);
9588
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009590 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009591 "str", /* tp_name */
9592 sizeof(PyUnicodeObject), /* tp_size */
9593 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009595 (destructor)unicode_dealloc, /* tp_dealloc */
9596 0, /* tp_print */
9597 0, /* tp_getattr */
9598 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009599 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009600 unicode_repr, /* tp_repr */
9601 &unicode_as_number, /* tp_as_number */
9602 &unicode_as_sequence, /* tp_as_sequence */
9603 &unicode_as_mapping, /* tp_as_mapping */
9604 (hashfunc) unicode_hash, /* tp_hash*/
9605 0, /* tp_call*/
9606 (reprfunc) unicode_str, /* tp_str */
9607 PyObject_GenericGetAttr, /* tp_getattro */
9608 0, /* tp_setattro */
9609 0, /* tp_as_buffer */
9610 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009612 unicode_doc, /* tp_doc */
9613 0, /* tp_traverse */
9614 0, /* tp_clear */
9615 PyUnicode_RichCompare, /* tp_richcompare */
9616 0, /* tp_weaklistoffset */
9617 unicode_iter, /* tp_iter */
9618 0, /* tp_iternext */
9619 unicode_methods, /* tp_methods */
9620 0, /* tp_members */
9621 0, /* tp_getset */
9622 &PyBaseObject_Type, /* tp_base */
9623 0, /* tp_dict */
9624 0, /* tp_descr_get */
9625 0, /* tp_descr_set */
9626 0, /* tp_dictoffset */
9627 0, /* tp_init */
9628 0, /* tp_alloc */
9629 unicode_new, /* tp_new */
9630 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631};
9632
9633/* Initialize the Unicode implementation */
9634
Thomas Wouters78890102000-07-22 19:25:51 +00009635void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009637 int i;
9638
Thomas Wouters477c8d52006-05-27 19:21:47 +00009639 /* XXX - move this array to unicodectype.c ? */
9640 Py_UNICODE linebreak[] = {
9641 0x000A, /* LINE FEED */
9642 0x000D, /* CARRIAGE RETURN */
9643 0x001C, /* FILE SEPARATOR */
9644 0x001D, /* GROUP SEPARATOR */
9645 0x001E, /* RECORD SEPARATOR */
9646 0x0085, /* NEXT LINE */
9647 0x2028, /* LINE SEPARATOR */
9648 0x2029, /* PARAGRAPH SEPARATOR */
9649 };
9650
Fred Drakee4315f52000-05-09 19:53:39 +00009651 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009652 free_list = NULL;
9653 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009655 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009656 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009657
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009658 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009660 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009662
9663 /* initialize the linebreak bloom filter */
9664 bloom_linebreak = make_bloom_mask(
9665 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9666 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009667
9668 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669}
9670
9671/* Finalize the Unicode implementation */
9672
Christian Heimesa156e092008-02-16 07:38:31 +00009673int
9674PyUnicode_ClearFreeList(void)
9675{
9676 int freelist_size = numfree;
9677 PyUnicodeObject *u;
9678
9679 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009680 PyUnicodeObject *v = u;
9681 u = *(PyUnicodeObject **)u;
9682 if (v->str)
9683 PyObject_DEL(v->str);
9684 Py_XDECREF(v->defenc);
9685 PyObject_Del(v);
9686 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009687 }
9688 free_list = NULL;
9689 assert(numfree == 0);
9690 return freelist_size;
9691}
9692
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693void
Thomas Wouters78890102000-07-22 19:25:51 +00009694_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009696 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009698 Py_XDECREF(unicode_empty);
9699 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009700
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009701 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009702 if (unicode_latin1[i]) {
9703 Py_DECREF(unicode_latin1[i]);
9704 unicode_latin1[i] = NULL;
9705 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009706 }
Christian Heimesa156e092008-02-16 07:38:31 +00009707 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009709
Walter Dörwald16807132007-05-25 13:52:07 +00009710void
9711PyUnicode_InternInPlace(PyObject **p)
9712{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009713 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9714 PyObject *t;
9715 if (s == NULL || !PyUnicode_Check(s))
9716 Py_FatalError(
9717 "PyUnicode_InternInPlace: unicode strings only please!");
9718 /* If it's a subclass, we don't really know what putting
9719 it in the interned dict might do. */
9720 if (!PyUnicode_CheckExact(s))
9721 return;
9722 if (PyUnicode_CHECK_INTERNED(s))
9723 return;
9724 if (interned == NULL) {
9725 interned = PyDict_New();
9726 if (interned == NULL) {
9727 PyErr_Clear(); /* Don't leave an exception */
9728 return;
9729 }
9730 }
9731 /* It might be that the GetItem call fails even
9732 though the key is present in the dictionary,
9733 namely when this happens during a stack overflow. */
9734 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009735 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009736 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009737
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 if (t) {
9739 Py_INCREF(t);
9740 Py_DECREF(*p);
9741 *p = t;
9742 return;
9743 }
Walter Dörwald16807132007-05-25 13:52:07 +00009744
Benjamin Peterson14339b62009-01-31 16:36:08 +00009745 PyThreadState_GET()->recursion_critical = 1;
9746 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9747 PyErr_Clear();
9748 PyThreadState_GET()->recursion_critical = 0;
9749 return;
9750 }
9751 PyThreadState_GET()->recursion_critical = 0;
9752 /* The two references in interned are not counted by refcnt.
9753 The deallocator will take care of this */
9754 Py_REFCNT(s) -= 2;
9755 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009756}
9757
9758void
9759PyUnicode_InternImmortal(PyObject **p)
9760{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009761 PyUnicode_InternInPlace(p);
9762 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9763 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9764 Py_INCREF(*p);
9765 }
Walter Dörwald16807132007-05-25 13:52:07 +00009766}
9767
9768PyObject *
9769PyUnicode_InternFromString(const char *cp)
9770{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009771 PyObject *s = PyUnicode_FromString(cp);
9772 if (s == NULL)
9773 return NULL;
9774 PyUnicode_InternInPlace(&s);
9775 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009776}
9777
9778void _Py_ReleaseInternedUnicodeStrings(void)
9779{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009780 PyObject *keys;
9781 PyUnicodeObject *s;
9782 Py_ssize_t i, n;
9783 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009784
Benjamin Peterson14339b62009-01-31 16:36:08 +00009785 if (interned == NULL || !PyDict_Check(interned))
9786 return;
9787 keys = PyDict_Keys(interned);
9788 if (keys == NULL || !PyList_Check(keys)) {
9789 PyErr_Clear();
9790 return;
9791 }
Walter Dörwald16807132007-05-25 13:52:07 +00009792
Benjamin Peterson14339b62009-01-31 16:36:08 +00009793 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9794 detector, interned unicode strings are not forcibly deallocated;
9795 rather, we give them their stolen references back, and then clear
9796 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009797
Benjamin Peterson14339b62009-01-31 16:36:08 +00009798 n = PyList_GET_SIZE(keys);
9799 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009800 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009801 for (i = 0; i < n; i++) {
9802 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9803 switch (s->state) {
9804 case SSTATE_NOT_INTERNED:
9805 /* XXX Shouldn't happen */
9806 break;
9807 case SSTATE_INTERNED_IMMORTAL:
9808 Py_REFCNT(s) += 1;
9809 immortal_size += s->length;
9810 break;
9811 case SSTATE_INTERNED_MORTAL:
9812 Py_REFCNT(s) += 2;
9813 mortal_size += s->length;
9814 break;
9815 default:
9816 Py_FatalError("Inconsistent interned string state.");
9817 }
9818 s->state = SSTATE_NOT_INTERNED;
9819 }
9820 fprintf(stderr, "total size of all interned strings: "
9821 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9822 "mortal/immortal\n", mortal_size, immortal_size);
9823 Py_DECREF(keys);
9824 PyDict_Clear(interned);
9825 Py_DECREF(interned);
9826 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009827}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009828
9829
9830/********************* Unicode Iterator **************************/
9831
9832typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009833 PyObject_HEAD
9834 Py_ssize_t it_index;
9835 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009836} unicodeiterobject;
9837
9838static void
9839unicodeiter_dealloc(unicodeiterobject *it)
9840{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009841 _PyObject_GC_UNTRACK(it);
9842 Py_XDECREF(it->it_seq);
9843 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009844}
9845
9846static int
9847unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9848{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009849 Py_VISIT(it->it_seq);
9850 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009851}
9852
9853static PyObject *
9854unicodeiter_next(unicodeiterobject *it)
9855{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009856 PyUnicodeObject *seq;
9857 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009858
Benjamin Peterson14339b62009-01-31 16:36:08 +00009859 assert(it != NULL);
9860 seq = it->it_seq;
9861 if (seq == NULL)
9862 return NULL;
9863 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009864
Benjamin Peterson14339b62009-01-31 16:36:08 +00009865 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9866 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009867 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009868 if (item != NULL)
9869 ++it->it_index;
9870 return item;
9871 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009872
Benjamin Peterson14339b62009-01-31 16:36:08 +00009873 Py_DECREF(seq);
9874 it->it_seq = NULL;
9875 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009876}
9877
9878static PyObject *
9879unicodeiter_len(unicodeiterobject *it)
9880{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009881 Py_ssize_t len = 0;
9882 if (it->it_seq)
9883 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9884 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009885}
9886
9887PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9888
9889static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009890 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009891 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009892 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009893};
9894
9895PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009896 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9897 "str_iterator", /* tp_name */
9898 sizeof(unicodeiterobject), /* tp_basicsize */
9899 0, /* tp_itemsize */
9900 /* methods */
9901 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9902 0, /* tp_print */
9903 0, /* tp_getattr */
9904 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009905 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009906 0, /* tp_repr */
9907 0, /* tp_as_number */
9908 0, /* tp_as_sequence */
9909 0, /* tp_as_mapping */
9910 0, /* tp_hash */
9911 0, /* tp_call */
9912 0, /* tp_str */
9913 PyObject_GenericGetAttr, /* tp_getattro */
9914 0, /* tp_setattro */
9915 0, /* tp_as_buffer */
9916 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9917 0, /* tp_doc */
9918 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9919 0, /* tp_clear */
9920 0, /* tp_richcompare */
9921 0, /* tp_weaklistoffset */
9922 PyObject_SelfIter, /* tp_iter */
9923 (iternextfunc)unicodeiter_next, /* tp_iternext */
9924 unicodeiter_methods, /* tp_methods */
9925 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009926};
9927
9928static PyObject *
9929unicode_iter(PyObject *seq)
9930{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009931 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009932
Benjamin Peterson14339b62009-01-31 16:36:08 +00009933 if (!PyUnicode_Check(seq)) {
9934 PyErr_BadInternalCall();
9935 return NULL;
9936 }
9937 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9938 if (it == NULL)
9939 return NULL;
9940 it->it_index = 0;
9941 Py_INCREF(seq);
9942 it->it_seq = (PyUnicodeObject *)seq;
9943 _PyObject_GC_TRACK(it);
9944 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009945}
9946
Martin v. Löwis5b222132007-06-10 09:51:05 +00009947size_t
9948Py_UNICODE_strlen(const Py_UNICODE *u)
9949{
9950 int res = 0;
9951 while(*u++)
9952 res++;
9953 return res;
9954}
9955
9956Py_UNICODE*
9957Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9958{
9959 Py_UNICODE *u = s1;
9960 while ((*u++ = *s2++));
9961 return s1;
9962}
9963
9964Py_UNICODE*
9965Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9966{
9967 Py_UNICODE *u = s1;
9968 while ((*u++ = *s2++))
9969 if (n-- == 0)
9970 break;
9971 return s1;
9972}
9973
9974int
9975Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9976{
9977 while (*s1 && *s2 && *s1 == *s2)
9978 s1++, s2++;
9979 if (*s1 && *s2)
9980 return (*s1 < *s2) ? -1 : +1;
9981 if (*s1)
9982 return 1;
9983 if (*s2)
9984 return -1;
9985 return 0;
9986}
9987
Victor Stinneref8d95c2010-08-16 22:03:11 +00009988int
9989Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9990{
9991 register Py_UNICODE u1, u2;
9992 for (; n != 0; n--) {
9993 u1 = *s1;
9994 u2 = *s2;
9995 if (u1 != u2)
9996 return (u1 < u2) ? -1 : +1;
9997 if (u1 == '\0')
9998 return 0;
9999 s1++;
10000 s2++;
10001 }
10002 return 0;
10003}
10004
Martin v. Löwis5b222132007-06-10 09:51:05 +000010005Py_UNICODE*
10006Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10007{
10008 const Py_UNICODE *p;
10009 for (p = s; *p; p++)
10010 if (*p == c)
10011 return (Py_UNICODE*)p;
10012 return NULL;
10013}
10014
Victor Stinner331ea922010-08-10 16:37:20 +000010015Py_UNICODE*
10016Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10017{
10018 const Py_UNICODE *p;
10019 p = s + Py_UNICODE_strlen(s);
10020 while (p != s) {
10021 p--;
10022 if (*p == c)
10023 return (Py_UNICODE*)p;
10024 }
10025 return NULL;
10026}
10027
Martin v. Löwis5b222132007-06-10 09:51:05 +000010028
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010029#ifdef __cplusplus
10030}
10031#endif