blob: 60124929cb09544cdcd65bc80318012837e3d03b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Lookup table covering the ASCII range (indices 0x00-0x7F): an entry is 1
   when the code point is one of the most frequent whitespace characters and
   0 otherwise.  Each row below holds 16 consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Lookup table covering the ASCII range (indices 0x00-0x7F): an entry is 1
   when the code point is treated as a line break and 0 otherwise.  The
   table is only ever read (see BLOOM_LINEBREAK), hence it is const.
   Each row below holds 16 consecutive code points. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   (ch & (BLOOM_WIDTH - 1)) of each unicode character as the bit index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
/* BLOOM_ADD(mask, ch): set, in the lvalue `mask`, the bit selected by the
   low bits of `ch` (ch & (BLOOM_WIDTH - 1)).
   BLOOM(mask, ch): nonzero iff that bit is set in `mask`.
   Both arguments are fully parenthesized so that compound expressions such
   as BLOOM(a | b, ch) are evaluated as written. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* BLOOM_MEMBER(mask, chr, set, setlen): true iff `chr` is a member of `set`.
   The cheap bloom test on `mask` runs first; the exact (linear) check via
   unicode_member() only runs when the bloom test passes.  The whole
   expansion is parenthesized so the macro can be safely negated or
   combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000310 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000313
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 return 0;
315}
316
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
325
326static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000327PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328{
329 register PyUnicodeObject *unicode;
330
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 if (length == 0 && unicode_empty != NULL) {
333 Py_INCREF(unicode_empty);
334 return unicode_empty;
335 }
336
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000337 /* Ensure we won't overflow the size. */
338 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
339 return (PyUnicodeObject *)PyErr_NoMemory();
340 }
341
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000343 if (free_list) {
344 unicode = free_list;
345 free_list = *(PyUnicodeObject **)unicode;
346 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000347 if (unicode->str) {
348 /* Keep-Alive optimization: we only upsize the buffer,
349 never downsize it. */
350 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000351 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000352 PyObject_DEL(unicode->str);
353 unicode->str = NULL;
354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000355 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000356 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000359 }
360 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000364 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 if (unicode == NULL)
366 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000367 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
368 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
370
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000371 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 PyErr_NoMemory();
373 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000374 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000375 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000376 * the caller fails before initializing str -- unicode_resize()
377 * reads str[0], and the Keep-Alive optimization can keep memory
378 * allocated for str alive across a call to unicode_dealloc(unicode).
379 * We don't want unicode_resize to read uninitialized memory in
380 * that case.
381 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000382 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000384 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000386 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000387 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000389
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000391 /* XXX UNREF/NEWREF interface should be more symmetrical */
392 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000393 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000394 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396}
397
398static
Guido van Rossum9475a232001-10-05 20:51:39 +0000399void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400{
Walter Dörwald16807132007-05-25 13:52:07 +0000401 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000402 case SSTATE_NOT_INTERNED:
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_MORTAL:
406 /* revive dead object temporarily for DelItem */
407 Py_REFCNT(unicode) = 3;
408 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
409 Py_FatalError(
410 "deletion of interned string failed");
411 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000412
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 case SSTATE_INTERNED_IMMORTAL:
414 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000415
Benjamin Peterson29060642009-01-31 22:14:21 +0000416 default:
417 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000418 }
419
Guido van Rossum604ddf82001-12-06 20:03:56 +0000420 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000422 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000423 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
424 PyObject_DEL(unicode->str);
425 unicode->str = NULL;
426 unicode->length = 0;
427 }
428 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000429 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000430 }
431 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000432 *(PyUnicodeObject **)unicode = free_list;
433 free_list = unicode;
434 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 }
436 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000437 PyObject_DEL(unicode->str);
438 Py_XDECREF(unicode->defenc);
439 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 }
441}
442
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000443static
444int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445{
446 register PyUnicodeObject *v;
447
448 /* Argument checks */
449 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 PyErr_BadInternalCall();
451 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000452 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000453 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000454 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 PyErr_BadInternalCall();
456 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000457 }
458
459 /* Resizing unicode_empty and single character objects is not
460 possible since these are being shared. We simply return a fresh
461 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000462 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000463 (v == unicode_empty || v->length == 1)) {
464 PyUnicodeObject *w = _PyUnicode_New(length);
465 if (w == NULL)
466 return -1;
467 Py_UNICODE_COPY(w->str, v->str,
468 length < v->length ? length : v->length);
469 Py_DECREF(*unicode);
470 *unicode = w;
471 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
473
474 /* Note that we don't have to modify *unicode for unshared Unicode
475 objects, since we can modify them in-place. */
476 return unicode_resize(v, length);
477}
478
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000479int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
480{
481 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
482}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000485 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486{
487 PyUnicodeObject *unicode;
488
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000489 /* If the Unicode data is known at construction time, we can apply
490 some optimizations which share commonly used objects. */
491 if (u != NULL) {
492
Benjamin Peterson29060642009-01-31 22:14:21 +0000493 /* Optimization for empty strings */
494 if (size == 0 && unicode_empty != NULL) {
495 Py_INCREF(unicode_empty);
496 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000497 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000498
499 /* Single character Unicode objects in the Latin-1 range are
500 shared when using this constructor */
501 if (size == 1 && *u < 256) {
502 unicode = unicode_latin1[*u];
503 if (!unicode) {
504 unicode = _PyUnicode_New(1);
505 if (!unicode)
506 return NULL;
507 unicode->str[0] = *u;
508 unicode_latin1[*u] = unicode;
509 }
510 Py_INCREF(unicode);
511 return (PyObject *)unicode;
512 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000513 }
Tim Petersced69f82003-09-16 20:30:58 +0000514
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515 unicode = _PyUnicode_New(size);
516 if (!unicode)
517 return NULL;
518
519 /* Copy the Unicode data into the new object */
520 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000521 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522
523 return (PyObject *)unicode;
524}
525
Walter Dörwaldd2034312007-05-18 16:29:38 +0000526PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527{
528 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Benjamin Peterson14339b62009-01-31 16:36:08 +0000530 if (size < 0) {
531 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000532 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000533 return NULL;
534 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000535
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000537 some optimizations which share commonly used objects.
538 Also, this means the input must be UTF-8, so fall back to the
539 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000540 if (u != NULL) {
541
Benjamin Peterson29060642009-01-31 22:14:21 +0000542 /* Optimization for empty strings */
543 if (size == 0 && unicode_empty != NULL) {
544 Py_INCREF(unicode_empty);
545 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000546 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000547
548 /* Single characters are shared when using this constructor.
549 Restrict to ASCII, since the input must be UTF-8. */
550 if (size == 1 && Py_CHARMASK(*u) < 128) {
551 unicode = unicode_latin1[Py_CHARMASK(*u)];
552 if (!unicode) {
553 unicode = _PyUnicode_New(1);
554 if (!unicode)
555 return NULL;
556 unicode->str[0] = Py_CHARMASK(*u);
557 unicode_latin1[Py_CHARMASK(*u)] = unicode;
558 }
559 Py_INCREF(unicode);
560 return (PyObject *)unicode;
561 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000562
563 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 }
565
Walter Dörwald55507312007-05-18 13:12:10 +0000566 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000567 if (!unicode)
568 return NULL;
569
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000570 return (PyObject *)unicode;
571}
572
Walter Dörwaldd2034312007-05-18 16:29:38 +0000573PyObject *PyUnicode_FromString(const char *u)
574{
575 size_t size = strlen(u);
576 if (size > PY_SSIZE_T_MAX) {
577 PyErr_SetString(PyExc_OverflowError, "input too long");
578 return NULL;
579 }
580
581 return PyUnicode_FromStringAndSize(u, size);
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584#ifdef HAVE_WCHAR_H
585
Mark Dickinson081dfee2009-03-18 14:47:41 +0000586#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
587# define CONVERT_WCHAR_TO_SURROGATES
588#endif
589
590#ifdef CONVERT_WCHAR_TO_SURROGATES
591
592/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
593 to convert from UTF32 to UTF16. */
594
595PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
596 Py_ssize_t size)
597{
598 PyUnicodeObject *unicode;
599 register Py_ssize_t i;
600 Py_ssize_t alloc;
601 const wchar_t *orig_w;
602
603 if (w == NULL) {
604 if (size == 0)
605 return PyUnicode_FromStringAndSize(NULL, 0);
606 PyErr_BadInternalCall();
607 return NULL;
608 }
609
610 if (size == -1) {
611 size = wcslen(w);
612 }
613
614 alloc = size;
615 orig_w = w;
616 for (i = size; i > 0; i--) {
617 if (*w > 0xFFFF)
618 alloc++;
619 w++;
620 }
621 w = orig_w;
622 unicode = _PyUnicode_New(alloc);
623 if (!unicode)
624 return NULL;
625
626 /* Copy the wchar_t data into the new object */
627 {
628 register Py_UNICODE *u;
629 u = PyUnicode_AS_UNICODE(unicode);
630 for (i = size; i > 0; i--) {
631 if (*w > 0xFFFF) {
632 wchar_t ordinal = *w++;
633 ordinal -= 0x10000;
634 *u++ = 0xD800 | (ordinal >> 10);
635 *u++ = 0xDC00 | (ordinal & 0x3FF);
636 }
637 else
638 *u++ = *w++;
639 }
640 }
641 return (PyObject *)unicode;
642}
643
644#else
645
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000647 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648{
649 PyUnicodeObject *unicode;
650
651 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000652 if (size == 0)
653 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 PyErr_BadInternalCall();
655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 }
657
Martin v. Löwis790465f2008-04-05 20:41:37 +0000658 if (size == -1) {
659 size = wcslen(w);
660 }
661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 unicode = _PyUnicode_New(size);
663 if (!unicode)
664 return NULL;
665
666 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000667#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000669#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000671 register Py_UNICODE *u;
672 register Py_ssize_t i;
673 u = PyUnicode_AS_UNICODE(unicode);
674 for (i = size; i > 0; i--)
675 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000676 }
677#endif
678
679 return (PyObject *)unicode;
680}
681
Mark Dickinson081dfee2009-03-18 14:47:41 +0000682#endif /* CONVERT_WCHAR_TO_SURROGATES */
683
684#undef CONVERT_WCHAR_TO_SURROGATES
685
/* Assemble a printf-style conversion specification in fmt.

   The result has the shape "%[0][width][.precision][modifier]c", where
   the modifier is "l" (longflag), PY_FORMAT_LONG_LONG (longlongflag) or
   PY_FORMAT_SIZE_T (size_tflag); the first flag that is set wins.  A
   width or precision of 0 is omitted, and the '0' pad flag is only
   emitted together with a non-zero width.  The caller must provide a
   buffer large enough for the resulting string. */
static void
makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
        int zeropad, int width, int precision, char c)
{
    const char *modifier = NULL;
    char *out = fmt;

    *out++ = '%';
    if (width != 0) {
        if (zeropad)
            *out++ = '0';
        out += sprintf(out, "%d", width);
    }
    if (precision != 0)
        out += sprintf(out, ".%d", precision);

    /* Select the length modifier, if any. */
    if (longflag) {
        modifier = "l";
    }
    else if (longlongflag) {
        /* longlongflag should only ever be nonzero on machines with
           HAVE_LONG_LONG defined */
#ifdef HAVE_LONG_LONG
        modifier = PY_FORMAT_LONG_LONG;
#else
        /* we shouldn't ever get here */
        assert(0);
        modifier = "l";
#endif
    }
    else if (size_tflag) {
        modifier = PY_FORMAT_SIZE_T;
    }

    if (modifier != NULL) {
        while (*modifier)
            *out++ = *modifier++;
    }

    *out++ = c;
    *out = '\0';
}
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
723
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000724/* size of fixed-size buffer for formatting single arguments */
725#define ITEM_BUFFER_LEN 21
726/* maximum number of characters required for output of %ld. 21 characters
727 allows for 64-bit integers (in decimal) and an optional sign. */
728#define MAX_LONG_CHARS 21
729/* maximum number of characters required for output of %lld.
730 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
731 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
732#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
733
Walter Dörwaldd2034312007-05-18 16:29:38 +0000734PyObject *
735PyUnicode_FromFormatV(const char *format, va_list vargs)
736{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000737 va_list count;
738 Py_ssize_t callcount = 0;
739 PyObject **callresults = NULL;
740 PyObject **callresult = NULL;
741 Py_ssize_t n = 0;
742 int width = 0;
743 int precision = 0;
744 int zeropad;
745 const char* f;
746 Py_UNICODE *s;
747 PyObject *string;
748 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000749 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000750 /* use abuffer instead of buffer, if we need more space
751 * (which can happen if there's a format specifier with width). */
752 char *abuffer = NULL;
753 char *realbuffer;
754 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000755 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000756 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000758 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000759 /* step 1: count the number of %S/%R/%A/%s format specifications
760 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
761 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
762 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000763 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000764 if (*f == '%') {
765 if (*(f+1)=='%')
766 continue;
767 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
768 ++callcount;
769 while (ISDIGIT((unsigned)*f))
770 width = (width*10) + *f++ - '0';
771 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
772 ;
773 if (*f == 's')
774 ++callcount;
775 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000776 }
777 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000778 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000779 if (callcount) {
780 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
781 if (!callresults) {
782 PyErr_NoMemory();
783 return NULL;
784 }
785 callresult = callresults;
786 }
787 /* step 3: figure out how large a buffer we need */
788 for (f = format; *f; f++) {
789 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000790#ifdef HAVE_LONG_LONG
791 int longlongflag = 0;
792#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 const char* p = f;
794 width = 0;
795 while (ISDIGIT((unsigned)*f))
796 width = (width*10) + *f++ - '0';
797 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
798 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
801 * they don't affect the amount of space we reserve.
802 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000803 if (*f == 'l') {
804 if (f[1] == 'd' || f[1] == 'u') {
805 ++f;
806 }
807#ifdef HAVE_LONG_LONG
808 else if (f[1] == 'l' &&
809 (f[2] == 'd' || f[2] == 'u')) {
810 longlongflag = 1;
811 f += 2;
812 }
813#endif
814 }
815 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000816 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000817 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818
Benjamin Peterson14339b62009-01-31 16:36:08 +0000819 switch (*f) {
820 case 'c':
821 (void)va_arg(count, int);
822 /* fall through... */
823 case '%':
824 n++;
825 break;
826 case 'd': case 'u': case 'i': case 'x':
827 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000828#ifdef HAVE_LONG_LONG
829 if (longlongflag) {
830 if (width < MAX_LONG_LONG_CHARS)
831 width = MAX_LONG_LONG_CHARS;
832 }
833 else
834#endif
835 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
836 including sign. Decimal takes the most space. This
837 isn't enough for octal. If a width is specified we
838 need more (which we allocate later). */
839 if (width < MAX_LONG_CHARS)
840 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000842 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000843 if (abuffersize < width)
844 abuffersize = width;
845 break;
846 case 's':
847 {
848 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000849 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000850 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
851 if (!str)
852 goto fail;
853 n += PyUnicode_GET_SIZE(str);
854 /* Remember the str and switch to the next slot */
855 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000856 break;
857 }
858 case 'U':
859 {
860 PyObject *obj = va_arg(count, PyObject *);
861 assert(obj && PyUnicode_Check(obj));
862 n += PyUnicode_GET_SIZE(obj);
863 break;
864 }
865 case 'V':
866 {
867 PyObject *obj = va_arg(count, PyObject *);
868 const char *str = va_arg(count, const char *);
869 assert(obj || str);
870 assert(!obj || PyUnicode_Check(obj));
871 if (obj)
872 n += PyUnicode_GET_SIZE(obj);
873 else
874 n += strlen(str);
875 break;
876 }
877 case 'S':
878 {
879 PyObject *obj = va_arg(count, PyObject *);
880 PyObject *str;
881 assert(obj);
882 str = PyObject_Str(obj);
883 if (!str)
884 goto fail;
885 n += PyUnicode_GET_SIZE(str);
886 /* Remember the str and switch to the next slot */
887 *callresult++ = str;
888 break;
889 }
890 case 'R':
891 {
892 PyObject *obj = va_arg(count, PyObject *);
893 PyObject *repr;
894 assert(obj);
895 repr = PyObject_Repr(obj);
896 if (!repr)
897 goto fail;
898 n += PyUnicode_GET_SIZE(repr);
899 /* Remember the repr and switch to the next slot */
900 *callresult++ = repr;
901 break;
902 }
903 case 'A':
904 {
905 PyObject *obj = va_arg(count, PyObject *);
906 PyObject *ascii;
907 assert(obj);
908 ascii = PyObject_ASCII(obj);
909 if (!ascii)
910 goto fail;
911 n += PyUnicode_GET_SIZE(ascii);
912 /* Remember the repr and switch to the next slot */
913 *callresult++ = ascii;
914 break;
915 }
916 case 'p':
917 (void) va_arg(count, int);
918 /* maximum 64-bit pointer representation:
919 * 0xffffffffffffffff
920 * so 19 characters is enough.
921 * XXX I count 18 -- what's the extra for?
922 */
923 n += 19;
924 break;
925 default:
926 /* if we stumble upon an unknown
927 formatting code, copy the rest of
928 the format string to the output
929 string. (we cannot just skip the
930 code, since there's no way to know
931 what's in the argument list) */
932 n += strlen(p);
933 goto expand;
934 }
935 } else
936 n++;
937 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000938 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000939 if (abuffersize > ITEM_BUFFER_LEN) {
940 /* add 1 for sprintf's trailing null byte */
941 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000942 if (!abuffer) {
943 PyErr_NoMemory();
944 goto fail;
945 }
946 realbuffer = abuffer;
947 }
948 else
949 realbuffer = buffer;
950 /* step 4: fill the buffer */
951 /* Since we've analyzed how much space we need for the worst case,
952 we don't have to resize the string.
953 There can be no errors beyond this point. */
954 string = PyUnicode_FromUnicode(NULL, n);
955 if (!string)
956 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000957
Benjamin Peterson14339b62009-01-31 16:36:08 +0000958 s = PyUnicode_AS_UNICODE(string);
959 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000960
Benjamin Peterson14339b62009-01-31 16:36:08 +0000961 for (f = format; *f; f++) {
962 if (*f == '%') {
963 const char* p = f++;
964 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000965 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000966 int size_tflag = 0;
967 zeropad = (*f == '0');
968 /* parse the width.precision part */
969 width = 0;
970 while (ISDIGIT((unsigned)*f))
971 width = (width*10) + *f++ - '0';
972 precision = 0;
973 if (*f == '.') {
974 f++;
975 while (ISDIGIT((unsigned)*f))
976 precision = (precision*10) + *f++ - '0';
977 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000978 /* Handle %ld, %lu, %lld and %llu. */
979 if (*f == 'l') {
980 if (f[1] == 'd' || f[1] == 'u') {
981 longflag = 1;
982 ++f;
983 }
984#ifdef HAVE_LONG_LONG
985 else if (f[1] == 'l' &&
986 (f[2] == 'd' || f[2] == 'u')) {
987 longlongflag = 1;
988 f += 2;
989 }
990#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000991 }
992 /* handle the size_t flag. */
993 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
994 size_tflag = 1;
995 ++f;
996 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000997
Benjamin Peterson14339b62009-01-31 16:36:08 +0000998 switch (*f) {
999 case 'c':
1000 *s++ = va_arg(vargs, int);
1001 break;
1002 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001003 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1004 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001005 if (longflag)
1006 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001007#ifdef HAVE_LONG_LONG
1008 else if (longlongflag)
1009 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1010#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001011 else if (size_tflag)
1012 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1013 else
1014 sprintf(realbuffer, fmt, va_arg(vargs, int));
1015 appendstring(realbuffer);
1016 break;
1017 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001018 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1019 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001020 if (longflag)
1021 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001022#ifdef HAVE_LONG_LONG
1023 else if (longlongflag)
1024 sprintf(realbuffer, fmt, va_arg(vargs,
1025 unsigned PY_LONG_LONG));
1026#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001027 else if (size_tflag)
1028 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1029 else
1030 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1031 appendstring(realbuffer);
1032 break;
1033 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 sprintf(realbuffer, fmt, va_arg(vargs, int));
1036 appendstring(realbuffer);
1037 break;
1038 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001039 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001040 sprintf(realbuffer, fmt, va_arg(vargs, int));
1041 appendstring(realbuffer);
1042 break;
1043 case 's':
1044 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001045 /* unused, since we already have the result */
1046 (void) va_arg(vargs, char *);
1047 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1048 PyUnicode_GET_SIZE(*callresult));
1049 s += PyUnicode_GET_SIZE(*callresult);
1050 /* We're done with the unicode()/repr() => forget it */
1051 Py_DECREF(*callresult);
1052 /* switch to next unicode()/repr() result */
1053 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001054 break;
1055 }
1056 case 'U':
1057 {
1058 PyObject *obj = va_arg(vargs, PyObject *);
1059 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1060 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1061 s += size;
1062 break;
1063 }
1064 case 'V':
1065 {
1066 PyObject *obj = va_arg(vargs, PyObject *);
1067 const char *str = va_arg(vargs, const char *);
1068 if (obj) {
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 } else {
1073 appendstring(str);
1074 }
1075 break;
1076 }
1077 case 'S':
1078 case 'R':
1079 {
1080 Py_UNICODE *ucopy;
1081 Py_ssize_t usize;
1082 Py_ssize_t upos;
1083 /* unused, since we already have the result */
1084 (void) va_arg(vargs, PyObject *);
1085 ucopy = PyUnicode_AS_UNICODE(*callresult);
1086 usize = PyUnicode_GET_SIZE(*callresult);
1087 for (upos = 0; upos<usize;)
1088 *s++ = ucopy[upos++];
1089 /* We're done with the unicode()/repr() => forget it */
1090 Py_DECREF(*callresult);
1091 /* switch to next unicode()/repr() result */
1092 ++callresult;
1093 break;
1094 }
1095 case 'p':
1096 sprintf(buffer, "%p", va_arg(vargs, void*));
1097 /* %p is ill-defined: ensure leading 0x. */
1098 if (buffer[1] == 'X')
1099 buffer[1] = 'x';
1100 else if (buffer[1] != 'x') {
1101 memmove(buffer+2, buffer, strlen(buffer)+1);
1102 buffer[0] = '0';
1103 buffer[1] = 'x';
1104 }
1105 appendstring(buffer);
1106 break;
1107 case '%':
1108 *s++ = '%';
1109 break;
1110 default:
1111 appendstring(p);
1112 goto end;
1113 }
1114 } else
1115 *s++ = *f;
1116 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001117
Benjamin Peterson29060642009-01-31 22:14:21 +00001118 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001119 if (callresults)
1120 PyObject_Free(callresults);
1121 if (abuffer)
1122 PyObject_Free(abuffer);
1123 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1124 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001125 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001126 if (callresults) {
1127 PyObject **callresult2 = callresults;
1128 while (callresult2 < callresult) {
1129 Py_DECREF(*callresult2);
1130 ++callresult2;
1131 }
1132 PyObject_Free(callresults);
1133 }
1134 if (abuffer)
1135 PyObject_Free(abuffer);
1136 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001137}
1138
1139#undef appendstring
1140
1141PyObject *
1142PyUnicode_FromFormat(const char *format, ...)
1143{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 PyObject* ret;
1145 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001146
1147#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001148 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001149#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001150 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001151#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 ret = PyUnicode_FromFormatV(format, vargs);
1153 va_end(vargs);
1154 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
Martin v. Löwis18e16552006-02-15 17:27:45 +00001157Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 wchar_t *w,
1159 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160{
1161 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001162 PyErr_BadInternalCall();
1163 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001165
1166 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001169
Daniel Stutzbach8515eae2010-08-24 21:57:33 +00001170#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 memcpy(w, unicode->str, size * sizeof(wchar_t));
1172#else
1173 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001174 register Py_UNICODE *u;
1175 register Py_ssize_t i;
1176 u = PyUnicode_AS_UNICODE(unicode);
1177 for (i = size; i > 0; i--)
1178 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 }
1180#endif
1181
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001182 if (size > PyUnicode_GET_SIZE(unicode))
1183 return PyUnicode_GET_SIZE(unicode);
1184 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001185 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186}
1187
1188#endif
1189
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001190PyObject *PyUnicode_FromOrdinal(int ordinal)
1191{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001192 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001193
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001194 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 PyErr_SetString(PyExc_ValueError,
1196 "chr() arg not in range(0x110000)");
1197 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001198 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001199
1200#ifndef Py_UNICODE_WIDE
1201 if (ordinal > 0xffff) {
1202 ordinal -= 0x10000;
1203 s[0] = 0xD800 | (ordinal >> 10);
1204 s[1] = 0xDC00 | (ordinal & 0x3FF);
1205 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001206 }
1207#endif
1208
Hye-Shik Chang40574832004-04-06 07:24:51 +00001209 s[0] = (Py_UNICODE)ordinal;
1210 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001211}
1212
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213PyObject *PyUnicode_FromObject(register PyObject *obj)
1214{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001215 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001217 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001218 Py_INCREF(obj);
1219 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001220 }
1221 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001222 /* For a Unicode subtype that's not a Unicode object,
1223 return a true Unicode object with the same data. */
1224 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1225 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001226 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001227 PyErr_Format(PyExc_TypeError,
1228 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001229 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001230 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001231}
1232
1233PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 const char *encoding,
1235 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001236{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001237 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001238 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001239
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001241 PyErr_BadInternalCall();
1242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001244
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001245 /* Decoding bytes objects is the most common case and should be fast */
1246 if (PyBytes_Check(obj)) {
1247 if (PyBytes_GET_SIZE(obj) == 0) {
1248 Py_INCREF(unicode_empty);
1249 v = (PyObject *) unicode_empty;
1250 }
1251 else {
1252 v = PyUnicode_Decode(
1253 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1254 encoding, errors);
1255 }
1256 return v;
1257 }
1258
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001259 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001260 PyErr_SetString(PyExc_TypeError,
1261 "decoding str is not supported");
1262 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001263 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001264
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001265 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1266 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1267 PyErr_Format(PyExc_TypeError,
1268 "coercing to str: need bytes, bytearray "
1269 "or buffer-like object, %.80s found",
1270 Py_TYPE(obj)->tp_name);
1271 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001272 }
Tim Petersced69f82003-09-16 20:30:58 +00001273
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001274 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001275 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001276 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 }
Tim Petersced69f82003-09-16 20:30:58 +00001278 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001279 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001280
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001281 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283}
1284
/* Write a normalized copy of encoding into lower: upper case letters are
   converted to lower case and every '_' becomes '-', so that spellings
   such as UTF_8 are recognised.  lower has room for lower_len bytes,
   terminator included.  Return 1 on success, 0 when encoding does not
   fit (it is longer than lower_len-1 characters); in the latter case
   lower is left without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last usable slot; it is reserved for the terminating NUL. */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (ISUPPER(*src))
            ch = TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1317
1318PyObject *PyUnicode_Decode(const char *s,
1319 Py_ssize_t size,
1320 const char *encoding,
1321 const char *errors)
1322{
1323 PyObject *buffer = NULL, *unicode;
1324 Py_buffer info;
1325 char lower[11]; /* Enough for any encoding shortcut */
1326
1327 if (encoding == NULL)
1328 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001329
1330 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001331 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1332 if (strcmp(lower, "utf-8") == 0)
1333 return PyUnicode_DecodeUTF8(s, size, errors);
1334 else if ((strcmp(lower, "latin-1") == 0) ||
1335 (strcmp(lower, "iso-8859-1") == 0))
1336 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001337#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001338 else if (strcmp(lower, "mbcs") == 0)
1339 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001340#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001341 else if (strcmp(lower, "ascii") == 0)
1342 return PyUnicode_DecodeASCII(s, size, errors);
1343 else if (strcmp(lower, "utf-16") == 0)
1344 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1345 else if (strcmp(lower, "utf-32") == 0)
1346 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348
1349 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001350 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001351 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001352 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001353 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 if (buffer == NULL)
1355 goto onError;
1356 unicode = PyCodec_Decode(buffer, encoding, errors);
1357 if (unicode == NULL)
1358 goto onError;
1359 if (!PyUnicode_Check(unicode)) {
1360 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001361 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001362 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363 Py_DECREF(unicode);
1364 goto onError;
1365 }
1366 Py_DECREF(buffer);
1367 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Benjamin Peterson29060642009-01-31 22:14:21 +00001369 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 Py_XDECREF(buffer);
1371 return NULL;
1372}
1373
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001374PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1375 const char *encoding,
1376 const char *errors)
1377{
1378 PyObject *v;
1379
1380 if (!PyUnicode_Check(unicode)) {
1381 PyErr_BadArgument();
1382 goto onError;
1383 }
1384
1385 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001386 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001387
1388 /* Decode via the codec registry */
1389 v = PyCodec_Decode(unicode, encoding, errors);
1390 if (v == NULL)
1391 goto onError;
1392 return v;
1393
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001395 return NULL;
1396}
1397
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001398PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1399 const char *encoding,
1400 const char *errors)
1401{
1402 PyObject *v;
1403
1404 if (!PyUnicode_Check(unicode)) {
1405 PyErr_BadArgument();
1406 goto onError;
1407 }
1408
1409 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001411
1412 /* Decode via the codec registry */
1413 v = PyCodec_Decode(unicode, encoding, errors);
1414 if (v == NULL)
1415 goto onError;
1416 if (!PyUnicode_Check(v)) {
1417 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001418 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001419 Py_TYPE(v)->tp_name);
1420 Py_DECREF(v);
1421 goto onError;
1422 }
1423 return v;
1424
Benjamin Peterson29060642009-01-31 22:14:21 +00001425 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001426 return NULL;
1427}
1428
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001430 Py_ssize_t size,
1431 const char *encoding,
1432 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433{
1434 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001435
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436 unicode = PyUnicode_FromUnicode(s, size);
1437 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1440 Py_DECREF(unicode);
1441 return v;
1442}
1443
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001444PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1445 const char *encoding,
1446 const char *errors)
1447{
1448 PyObject *v;
1449
1450 if (!PyUnicode_Check(unicode)) {
1451 PyErr_BadArgument();
1452 goto onError;
1453 }
1454
1455 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001456 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001457
1458 /* Encode via the codec registry */
1459 v = PyCodec_Encode(unicode, encoding, errors);
1460 if (v == NULL)
1461 goto onError;
1462 return v;
1463
Benjamin Peterson29060642009-01-31 22:14:21 +00001464 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001465 return NULL;
1466}
1467
Victor Stinnerae6265f2010-05-15 16:27:27 +00001468PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1469{
Victor Stinner313a1202010-06-11 23:56:51 +00001470 if (Py_FileSystemDefaultEncoding) {
1471#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1472 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1473 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1474 PyUnicode_GET_SIZE(unicode),
1475 NULL);
1476#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001477 return PyUnicode_AsEncodedString(unicode,
1478 Py_FileSystemDefaultEncoding,
1479 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001480 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001481 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Victor Stinner3119ed72010-08-18 22:26:50 +00001482 PyUnicode_GET_SIZE(unicode),
1483 "surrogateescape");
Victor Stinnerae6265f2010-05-15 16:27:27 +00001484}
1485
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1487 const char *encoding,
1488 const char *errors)
1489{
1490 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001491 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001492
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 if (!PyUnicode_Check(unicode)) {
1494 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 }
Fred Drakee4315f52000-05-09 19:53:39 +00001497
Tim Petersced69f82003-09-16 20:30:58 +00001498 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001500
1501 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001502 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1503 if (strcmp(lower, "utf-8") == 0)
1504 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1505 PyUnicode_GET_SIZE(unicode),
1506 errors);
1507 else if ((strcmp(lower, "latin-1") == 0) ||
1508 (strcmp(lower, "iso-8859-1") == 0))
1509 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1510 PyUnicode_GET_SIZE(unicode),
1511 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001513 else if (strcmp(lower, "mbcs") == 0)
1514 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1515 PyUnicode_GET_SIZE(unicode),
1516 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001517#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001518 else if (strcmp(lower, "ascii") == 0)
1519 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1520 PyUnicode_GET_SIZE(unicode),
1521 errors);
1522 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001523 /* During bootstrap, we may need to find the encodings
1524 package, to load the file system encoding, and require the
1525 file system encoding in order to load the encodings
1526 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001527
Victor Stinner59e62db2010-05-15 13:14:32 +00001528 Break out of this dependency by assuming that the path to
1529 the encodings module is ASCII-only. XXX could try wcstombs
1530 instead, if the file system encoding is the locale's
1531 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001532 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001533 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1534 !PyThreadState_GET()->interp->codecs_initialized)
1535 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1536 PyUnicode_GET_SIZE(unicode),
1537 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538
1539 /* Encode via the codec registry */
1540 v = PyCodec_Encode(unicode, encoding, errors);
1541 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001542 return NULL;
1543
1544 /* The normal path */
1545 if (PyBytes_Check(v))
1546 return v;
1547
1548 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001549 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001550 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001551 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001552
1553 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1554 "encoder %s returned bytearray instead of bytes",
1555 encoding);
1556 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001557 Py_DECREF(v);
1558 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001559 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001561 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1562 Py_DECREF(v);
1563 return b;
1564 }
1565
1566 PyErr_Format(PyExc_TypeError,
1567 "encoder did not return a bytes object (type=%.400s)",
1568 Py_TYPE(v)->tp_name);
1569 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001570 return NULL;
1571}
1572
1573PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1574 const char *encoding,
1575 const char *errors)
1576{
1577 PyObject *v;
1578
1579 if (!PyUnicode_Check(unicode)) {
1580 PyErr_BadArgument();
1581 goto onError;
1582 }
1583
1584 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001586
1587 /* Encode via the codec registry */
1588 v = PyCodec_Encode(unicode, encoding, errors);
1589 if (v == NULL)
1590 goto onError;
1591 if (!PyUnicode_Check(v)) {
1592 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001593 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001594 Py_TYPE(v)->tp_name);
1595 Py_DECREF(v);
1596 goto onError;
1597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001599
Benjamin Peterson29060642009-01-31 22:14:21 +00001600 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 return NULL;
1602}
1603
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001604PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001606{
1607 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001608 if (v)
1609 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001610 if (errors != NULL)
1611 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001612 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001613 PyUnicode_GET_SIZE(unicode),
1614 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001615 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001616 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001617 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001618 return v;
1619}
1620
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001621PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001622PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001623 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001624 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1625}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001626
Christian Heimes5894ba72007-11-04 11:43:14 +00001627PyObject*
1628PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1629{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001630 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1631 can be undefined. If it is case, decode using UTF-8. The following assumes
1632 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1633 bootstrapping process where the codecs aren't ready yet.
1634 */
1635 if (Py_FileSystemDefaultEncoding) {
1636#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001637 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001638 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001639 }
1640#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001641 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001642 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001643 }
1644#endif
1645 return PyUnicode_Decode(s, size,
1646 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001647 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001648 }
1649 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001650 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001651 }
1652}
1653
Martin v. Löwis011e8422009-05-05 04:43:17 +00001654
1655int
1656PyUnicode_FSConverter(PyObject* arg, void* addr)
1657{
1658 PyObject *output = NULL;
1659 Py_ssize_t size;
1660 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001661 if (arg == NULL) {
1662 Py_DECREF(*(PyObject**)addr);
1663 return 1;
1664 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001665 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001666 output = arg;
1667 Py_INCREF(output);
1668 }
1669 else {
1670 arg = PyUnicode_FromObject(arg);
1671 if (!arg)
1672 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001673 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001674 Py_DECREF(arg);
1675 if (!output)
1676 return 0;
1677 if (!PyBytes_Check(output)) {
1678 Py_DECREF(output);
1679 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1680 return 0;
1681 }
1682 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001683 size = PyBytes_GET_SIZE(output);
1684 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001685 if (size != strlen(data)) {
1686 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1687 Py_DECREF(output);
1688 return 0;
1689 }
1690 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001691 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001692}
1693
1694
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001695int
1696PyUnicode_FSDecoder(PyObject* arg, void* addr)
1697{
1698 PyObject *output = NULL;
1699 Py_ssize_t size;
1700 void *data;
1701 if (arg == NULL) {
1702 Py_DECREF(*(PyObject**)addr);
1703 return 1;
1704 }
1705 if (PyUnicode_Check(arg)) {
1706 output = arg;
1707 Py_INCREF(output);
1708 }
1709 else {
1710 arg = PyBytes_FromObject(arg);
1711 if (!arg)
1712 return 0;
1713 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1714 PyBytes_GET_SIZE(arg));
1715 Py_DECREF(arg);
1716 if (!output)
1717 return 0;
1718 if (!PyUnicode_Check(output)) {
1719 Py_DECREF(output);
1720 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1721 return 0;
1722 }
1723 }
1724 size = PyUnicode_GET_SIZE(output);
1725 data = PyUnicode_AS_UNICODE(output);
1726 if (size != Py_UNICODE_strlen(data)) {
1727 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1728 Py_DECREF(output);
1729 return 0;
1730 }
1731 *(PyObject**)addr = output;
1732 return Py_CLEANUP_SUPPORTED;
1733}
1734
1735
Martin v. Löwis5b222132007-06-10 09:51:05 +00001736char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001737_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001738{
Christian Heimesf3863112007-11-22 07:46:41 +00001739 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001740 if (!PyUnicode_Check(unicode)) {
1741 PyErr_BadArgument();
1742 return NULL;
1743 }
Christian Heimesf3863112007-11-22 07:46:41 +00001744 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1745 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001746 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001747 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001748 *psize = PyBytes_GET_SIZE(bytes);
1749 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001750}
1751
1752char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001753_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001754{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001755 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001756}
1757
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1759{
1760 if (!PyUnicode_Check(unicode)) {
1761 PyErr_BadArgument();
1762 goto onError;
1763 }
1764 return PyUnicode_AS_UNICODE(unicode);
1765
Benjamin Peterson29060642009-01-31 22:14:21 +00001766 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767 return NULL;
1768}
1769
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771{
1772 if (!PyUnicode_Check(unicode)) {
1773 PyErr_BadArgument();
1774 goto onError;
1775 }
1776 return PyUnicode_GET_SIZE(unicode);
1777
Benjamin Peterson29060642009-01-31 22:14:21 +00001778 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 return -1;
1780}
1781
Thomas Wouters78890102000-07-22 19:25:51 +00001782const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001783{
1784 return unicode_default_encoding;
1785}
1786
Victor Stinner554f3f02010-06-16 23:33:54 +00001787/* create or adjust a UnicodeDecodeError */
1788static void
1789make_decode_exception(PyObject **exceptionObject,
1790 const char *encoding,
1791 const char *input, Py_ssize_t length,
1792 Py_ssize_t startpos, Py_ssize_t endpos,
1793 const char *reason)
1794{
1795 if (*exceptionObject == NULL) {
1796 *exceptionObject = PyUnicodeDecodeError_Create(
1797 encoding, input, length, startpos, endpos, reason);
1798 }
1799 else {
1800 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1801 goto onError;
1802 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1803 goto onError;
1804 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1805 goto onError;
1806 }
1807 return;
1808
1809onError:
1810 Py_DECREF(*exceptionObject);
1811 *exceptionObject = NULL;
1812}
1813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001814/* error handling callback helper:
1815 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001816 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001817 and adjust various state variables.
1818 return 0 on success, -1 on error
1819*/
1820
1821static
1822int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 const char *encoding, const char *reason,
1824 const char **input, const char **inend, Py_ssize_t *startinpos,
1825 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1826 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001828 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001829
1830 PyObject *restuple = NULL;
1831 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001832 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001833 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001834 Py_ssize_t requiredsize;
1835 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001836 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001837 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001838 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 int res = -1;
1840
1841 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001842 *errorHandler = PyCodec_LookupError(errors);
1843 if (*errorHandler == NULL)
1844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 }
1846
Victor Stinner554f3f02010-06-16 23:33:54 +00001847 make_decode_exception(exceptionObject,
1848 encoding,
1849 *input, *inend - *input,
1850 *startinpos, *endinpos,
1851 reason);
1852 if (*exceptionObject == NULL)
1853 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854
1855 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1856 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001857 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001859 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001860 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001861 }
1862 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001864
1865 /* Copy back the bytes variables, which might have been modified by the
1866 callback */
1867 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1868 if (!inputobj)
1869 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001870 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001872 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001873 *input = PyBytes_AS_STRING(inputobj);
1874 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001875 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001876 /* we can DECREF safely, as the exception has another reference,
1877 so the object won't go away. */
1878 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001880 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001882 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001883 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1884 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001885 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001886
1887 /* need more space? (at least enough for what we
1888 have+the replacement+the rest of the string (starting
1889 at the new input position), so we won't have to check space
1890 when there are no errors in the rest of the string) */
1891 repptr = PyUnicode_AS_UNICODE(repunicode);
1892 repsize = PyUnicode_GET_SIZE(repunicode);
1893 requiredsize = *outpos + repsize + insize-newpos;
1894 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001895 if (requiredsize<2*outsize)
1896 requiredsize = 2*outsize;
1897 if (_PyUnicode_Resize(output, requiredsize) < 0)
1898 goto onError;
1899 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001900 }
1901 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001902 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001903 Py_UNICODE_COPY(*outptr, repptr, repsize);
1904 *outptr += repsize;
1905 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001906
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 /* we made it! */
1908 res = 0;
1909
Benjamin Peterson29060642009-01-31 22:14:21 +00001910 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001911 Py_XDECREF(restuple);
1912 return res;
1913}
1914
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001915/* --- UTF-7 Codec -------------------------------------------------------- */
1916
Antoine Pitrou244651a2009-05-04 18:56:13 +00001917/* See RFC2152 for details. We encode conservatively and decode liberally. */
1918
/* Base-64 helper macros used by the UTF-7 codec.  All of them may
   evaluate their argument more than once. */

/* Non-zero if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, +, /). */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds;
   any character not matched by an earlier arm yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) == '+') ? 62 : 63)

/* Base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) < 128)
1949
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code and is never written to, hence
 * const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; anything outside 1..127 is never direct.
 * directO  -- non-zero if Set O characters may be written as-is.
 * directWS -- non-zero if whitespace may be written as-is.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) can be passed safely.  c is evaluated
 * more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001995
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001996PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001997 Py_ssize_t size,
1998 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001999{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002000 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2001}
2002
Antoine Pitrou244651a2009-05-04 18:56:13 +00002003/* The decoder. The only state we preserve is our read position,
2004 * i.e. how many characters we have consumed. So if we end in the
2005 * middle of a shift sequence we have to back off the read position
2006 * and the output to the beginning of the sequence, otherwise we lose
2007 * all the shift state (seen bits, number of bits seen, high
2008 * surrogate). */
2009
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002010PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002011 Py_ssize_t size,
2012 const char *errors,
2013 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002014{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002015 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002016 Py_ssize_t startinpos;
2017 Py_ssize_t endinpos;
2018 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002019 const char *e;
2020 PyUnicodeObject *unicode;
2021 Py_UNICODE *p;
2022 const char *errmsg = "";
2023 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002024 Py_UNICODE *shiftOutStart;
2025 unsigned int base64bits = 0;
2026 unsigned long base64buffer = 0;
2027 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 PyObject *errorHandler = NULL;
2029 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002030
2031 unicode = _PyUnicode_New(size);
2032 if (!unicode)
2033 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002034 if (size == 0) {
2035 if (consumed)
2036 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002037 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002038 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002039
2040 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002041 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002042 e = s + size;
2043
2044 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002045 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002046 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002047 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002048
Antoine Pitrou244651a2009-05-04 18:56:13 +00002049 if (inShift) { /* in a base-64 section */
2050 if (IS_BASE64(ch)) { /* consume a base-64 character */
2051 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2052 base64bits += 6;
2053 s++;
2054 if (base64bits >= 16) {
2055 /* we have enough bits for a UTF-16 value */
2056 Py_UNICODE outCh = (Py_UNICODE)
2057 (base64buffer >> (base64bits-16));
2058 base64bits -= 16;
2059 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2060 if (surrogate) {
2061 /* expecting a second surrogate */
2062 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2063#ifdef Py_UNICODE_WIDE
2064 *p++ = (((surrogate & 0x3FF)<<10)
2065 | (outCh & 0x3FF)) + 0x10000;
2066#else
2067 *p++ = surrogate;
2068 *p++ = outCh;
2069#endif
2070 surrogate = 0;
2071 }
2072 else {
2073 surrogate = 0;
2074 errmsg = "second surrogate missing";
2075 goto utf7Error;
2076 }
2077 }
2078 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2079 /* first surrogate */
2080 surrogate = outCh;
2081 }
2082 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2083 errmsg = "unexpected second surrogate";
2084 goto utf7Error;
2085 }
2086 else {
2087 *p++ = outCh;
2088 }
2089 }
2090 }
2091 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002092 inShift = 0;
2093 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002094 if (surrogate) {
2095 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002096 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002097 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002098 if (base64bits > 0) { /* left-over bits */
2099 if (base64bits >= 6) {
2100 /* We've seen at least one base-64 character */
2101 errmsg = "partial character in shift sequence";
2102 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002103 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002104 else {
2105 /* Some bits remain; they should be zero */
2106 if (base64buffer != 0) {
2107 errmsg = "non-zero padding bits in shift sequence";
2108 goto utf7Error;
2109 }
2110 }
2111 }
2112 if (ch != '-') {
2113 /* '-' is absorbed; other terminating
2114 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002115 *p++ = ch;
2116 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002117 }
2118 }
2119 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002120 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002121 s++; /* consume '+' */
2122 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002123 s++;
2124 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002125 }
2126 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002127 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002128 shiftOutStart = p;
2129 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002130 }
2131 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002132 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002133 *p++ = ch;
2134 s++;
2135 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002136 else {
2137 startinpos = s-starts;
2138 s++;
2139 errmsg = "unexpected special character";
2140 goto utf7Error;
2141 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002142 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002143utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002144 outpos = p-PyUnicode_AS_UNICODE(unicode);
2145 endinpos = s-starts;
2146 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002147 errors, &errorHandler,
2148 "utf7", errmsg,
2149 &starts, &e, &startinpos, &endinpos, &exc, &s,
2150 &unicode, &outpos, &p))
2151 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002152 }
2153
Antoine Pitrou244651a2009-05-04 18:56:13 +00002154 /* end of string */
2155
2156 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2157 /* if we're in an inconsistent state, that's an error */
2158 if (surrogate ||
2159 (base64bits >= 6) ||
2160 (base64bits > 0 && base64buffer != 0)) {
2161 outpos = p-PyUnicode_AS_UNICODE(unicode);
2162 endinpos = size;
2163 if (unicode_decode_call_errorhandler(
2164 errors, &errorHandler,
2165 "utf7", "unterminated shift sequence",
2166 &starts, &e, &startinpos, &endinpos, &exc, &s,
2167 &unicode, &outpos, &p))
2168 goto onError;
2169 if (s < e)
2170 goto restart;
2171 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002172 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002173
2174 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002175 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002176 if (inShift) {
2177 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002178 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002179 }
2180 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002181 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002182 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002183 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002184
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002185 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002186 goto onError;
2187
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002188 Py_XDECREF(errorHandler);
2189 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002190 return (PyObject *)unicode;
2191
Benjamin Peterson29060642009-01-31 22:14:21 +00002192 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002193 Py_XDECREF(errorHandler);
2194 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002195 Py_DECREF(unicode);
2196 return NULL;
2197}
2198
2199
2200PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002201 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002202 int base64SetO,
2203 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002204 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002205{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002206 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002207 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002208 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002209 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002210 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002211 unsigned int base64bits = 0;
2212 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002213 char * out;
2214 char * start;
2215
2216 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002217 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002218
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002219 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002220 return PyErr_NoMemory();
2221
Antoine Pitrou244651a2009-05-04 18:56:13 +00002222 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002223 if (v == NULL)
2224 return NULL;
2225
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002226 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002227 for (;i < size; ++i) {
2228 Py_UNICODE ch = s[i];
2229
Antoine Pitrou244651a2009-05-04 18:56:13 +00002230 if (inShift) {
2231 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2232 /* shifting out */
2233 if (base64bits) { /* output remaining bits */
2234 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2235 base64buffer = 0;
2236 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002237 }
2238 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002239 /* Characters not in the BASE64 set implicitly unshift the sequence
2240 so no '-' is required, except if the character is itself a '-' */
2241 if (IS_BASE64(ch) || ch == '-') {
2242 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002243 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002244 *out++ = (char) ch;
2245 }
2246 else {
2247 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002248 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002249 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002250 else { /* not in a shift sequence */
2251 if (ch == '+') {
2252 *out++ = '+';
2253 *out++ = '-';
2254 }
2255 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2256 *out++ = (char) ch;
2257 }
2258 else {
2259 *out++ = '+';
2260 inShift = 1;
2261 goto encode_char;
2262 }
2263 }
2264 continue;
2265encode_char:
2266#ifdef Py_UNICODE_WIDE
2267 if (ch >= 0x10000) {
2268 /* code first surrogate */
2269 base64bits += 16;
2270 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2271 while (base64bits >= 6) {
2272 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2273 base64bits -= 6;
2274 }
2275 /* prepare second surrogate */
2276 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2277 }
2278#endif
2279 base64bits += 16;
2280 base64buffer = (base64buffer << 16) | ch;
2281 while (base64bits >= 6) {
2282 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2283 base64bits -= 6;
2284 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002285 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002286 if (base64bits)
2287 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2288 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002289 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002290 if (_PyBytes_Resize(&v, out - start) < 0)
2291 return NULL;
2292 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002293}
2294
/* Drop the helper macros of the UTF-7 codec so that they do not leak into
   the codecs defined further down in this file. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002300
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301/* --- UTF-8 Codec -------------------------------------------------------- */
2302
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   Zero marks a byte that cannot start a sequence.  See RFC 3629. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2324
Guido van Rossumd57fd912000-03-10 22:53:23 +00002325PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002326 Py_ssize_t size,
2327 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328{
Walter Dörwald69652032004-09-07 20:24:22 +00002329 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2330}
2331
Antoine Pitrouab868312009-01-10 15:40:25 +00002332/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2333#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2334
2335/* Mask to quickly check whether a C 'long' contains a
2336 non-ASCII, UTF8-encoded char. */
2337#if (SIZEOF_LONG == 8)
2338# define ASCII_CHAR_MASK 0x8080808080808080L
2339#elif (SIZEOF_LONG == 4)
2340# define ASCII_CHAR_MASK 0x80808080L
2341#else
2342# error C 'long' size should be either 4 or 8!
2343#endif
2344
Walter Dörwald69652032004-09-07 20:24:22 +00002345PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002346 Py_ssize_t size,
2347 const char *errors,
2348 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002349{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002350 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002352 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002353 Py_ssize_t startinpos;
2354 Py_ssize_t endinpos;
2355 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002356 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357 PyUnicodeObject *unicode;
2358 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002359 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002360 PyObject *errorHandler = NULL;
2361 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362
2363 /* Note: size will always be longer than the resulting Unicode
2364 character count */
2365 unicode = _PyUnicode_New(size);
2366 if (!unicode)
2367 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002368 if (size == 0) {
2369 if (consumed)
2370 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373
2374 /* Unpack UTF-8 encoded data */
2375 p = unicode->str;
2376 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002377 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002378
2379 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002380 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381
2382 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002383 /* Fast path for runs of ASCII characters. Given that common UTF-8
2384 input will consist of an overwhelming majority of ASCII
2385 characters, we try to optimize for this case by checking
2386 as many characters as a C 'long' can contain.
2387 First, check if we can do an aligned read, as most CPUs have
2388 a penalty for unaligned reads.
2389 */
2390 if (!((size_t) s & LONG_PTR_MASK)) {
2391 /* Help register allocation */
2392 register const char *_s = s;
2393 register Py_UNICODE *_p = p;
2394 while (_s < aligned_end) {
2395 /* Read a whole long at a time (either 4 or 8 bytes),
2396 and do a fast unrolled copy if it only contains ASCII
2397 characters. */
2398 unsigned long data = *(unsigned long *) _s;
2399 if (data & ASCII_CHAR_MASK)
2400 break;
2401 _p[0] = (unsigned char) _s[0];
2402 _p[1] = (unsigned char) _s[1];
2403 _p[2] = (unsigned char) _s[2];
2404 _p[3] = (unsigned char) _s[3];
2405#if (SIZEOF_LONG == 8)
2406 _p[4] = (unsigned char) _s[4];
2407 _p[5] = (unsigned char) _s[5];
2408 _p[6] = (unsigned char) _s[6];
2409 _p[7] = (unsigned char) _s[7];
2410#endif
2411 _s += SIZEOF_LONG;
2412 _p += SIZEOF_LONG;
2413 }
2414 s = _s;
2415 p = _p;
2416 if (s == e)
2417 break;
2418 ch = (unsigned char)*s;
2419 }
2420 }
2421
2422 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002423 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424 s++;
2425 continue;
2426 }
2427
2428 n = utf8_code_length[ch];
2429
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002430 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002431 if (consumed)
2432 break;
2433 else {
2434 errmsg = "unexpected end of data";
2435 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002436 endinpos = startinpos+1;
2437 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2438 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002439 goto utf8Error;
2440 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442
2443 switch (n) {
2444
2445 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002446 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002447 startinpos = s-starts;
2448 endinpos = startinpos+1;
2449 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002450
2451 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002452 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002453 startinpos = s-starts;
2454 endinpos = startinpos+1;
2455 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456
2457 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002458 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002459 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002460 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002461 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002462 goto utf8Error;
2463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002465 assert ((ch > 0x007F) && (ch <= 0x07FF));
2466 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467 break;
2468
2469 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002470 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2471 will result in surrogates in range d800-dfff. Surrogates are
2472 not valid UTF-8 so they are rejected.
2473 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2474 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002475 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002476 (s[2] & 0xc0) != 0x80 ||
2477 ((unsigned char)s[0] == 0xE0 &&
2478 (unsigned char)s[1] < 0xA0) ||
2479 ((unsigned char)s[0] == 0xED &&
2480 (unsigned char)s[1] > 0x9F)) {
2481 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002482 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002483 endinpos = startinpos + 1;
2484
2485 /* if s[1] first two bits are 1 and 0, then the invalid
2486 continuation byte is s[2], so increment endinpos by 1,
2487 if not, s[1] is invalid and endinpos doesn't need to
2488 be incremented. */
2489 if ((s[1] & 0xC0) == 0x80)
2490 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002491 goto utf8Error;
2492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002494 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2495 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002496 break;
2497
2498 case 4:
2499 if ((s[1] & 0xc0) != 0x80 ||
2500 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002501 (s[3] & 0xc0) != 0x80 ||
2502 ((unsigned char)s[0] == 0xF0 &&
2503 (unsigned char)s[1] < 0x90) ||
2504 ((unsigned char)s[0] == 0xF4 &&
2505 (unsigned char)s[1] > 0x8F)) {
2506 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002507 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002508 endinpos = startinpos + 1;
2509 if ((s[1] & 0xC0) == 0x80) {
2510 endinpos++;
2511 if ((s[2] & 0xC0) == 0x80)
2512 endinpos++;
2513 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002514 goto utf8Error;
2515 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002516 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002517 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2518 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2519
Fredrik Lundh8f455852001-06-27 18:59:43 +00002520#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002521 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002522#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002523 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002524
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002525 /* translate from 10000..10FFFF to 0..FFFF */
2526 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002527
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002528 /* high surrogate = top 10 bits added to D800 */
2529 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002530
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002531 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002532 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002533#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 }
2536 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002537 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002538
Benjamin Peterson29060642009-01-31 22:14:21 +00002539 utf8Error:
2540 outpos = p-PyUnicode_AS_UNICODE(unicode);
2541 if (unicode_decode_call_errorhandler(
2542 errors, &errorHandler,
2543 "utf8", errmsg,
2544 &starts, &e, &startinpos, &endinpos, &exc, &s,
2545 &unicode, &outpos, &p))
2546 goto onError;
2547 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002548 }
Walter Dörwald69652032004-09-07 20:24:22 +00002549 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002550 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551
2552 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002553 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554 goto onError;
2555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 Py_XDECREF(errorHandler);
2557 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558 return (PyObject *)unicode;
2559
Benjamin Peterson29060642009-01-31 22:14:21 +00002560 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 Py_XDECREF(errorHandler);
2562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 Py_DECREF(unicode);
2564 return NULL;
2565}
2566
Antoine Pitrouab868312009-01-10 15:40:25 +00002567#undef ASCII_CHAR_MASK
2568
2569
Tim Peters602f7402002-04-27 18:03:26 +00002570/* Allocation strategy: if the string is short, convert into a stack buffer
2571 and allocate exactly as much space needed at the end. Else allocate the
2572 maximum possible needed (4 result bytes per Unicode character), and return
2573 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002574*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002575PyObject *
2576PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002577 Py_ssize_t size,
2578 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579{
Tim Peters602f7402002-04-27 18:03:26 +00002580#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002581
Guido van Rossum98297ee2007-11-06 21:34:58 +00002582 Py_ssize_t i; /* index into s of next input byte */
2583 PyObject *result; /* result string object */
2584 char *p; /* next free byte in output buffer */
2585 Py_ssize_t nallocated; /* number of result bytes allocated */
2586 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002587 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002588 PyObject *errorHandler = NULL;
2589 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002590
Tim Peters602f7402002-04-27 18:03:26 +00002591 assert(s != NULL);
2592 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593
Tim Peters602f7402002-04-27 18:03:26 +00002594 if (size <= MAX_SHORT_UNICHARS) {
2595 /* Write into the stack buffer; nallocated can't overflow.
2596 * At the end, we'll allocate exactly as much heap space as it
2597 * turns out we need.
2598 */
2599 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002600 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002601 p = stackbuf;
2602 }
2603 else {
2604 /* Overallocate on the heap, and give the excess back at the end. */
2605 nallocated = size * 4;
2606 if (nallocated / 4 != size) /* overflow! */
2607 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002608 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002609 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002610 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002611 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002612 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002613
Tim Peters602f7402002-04-27 18:03:26 +00002614 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002615 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002616
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002617 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002618 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002620
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002622 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002623 *p++ = (char)(0xc0 | (ch >> 6));
2624 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002625 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002626#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002627 /* Special case: check for high and low surrogate */
2628 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2629 Py_UCS4 ch2 = s[i];
2630 /* Combine the two surrogates to form a UCS4 value */
2631 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2632 i++;
2633
2634 /* Encode UCS4 Unicode ordinals */
2635 *p++ = (char)(0xf0 | (ch >> 18));
2636 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002637 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2638 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002639 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002640#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002641 Py_ssize_t newpos;
2642 PyObject *rep;
2643 Py_ssize_t repsize, k;
2644 rep = unicode_encode_call_errorhandler
2645 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2646 s, size, &exc, i-1, i, &newpos);
2647 if (!rep)
2648 goto error;
2649
2650 if (PyBytes_Check(rep))
2651 repsize = PyBytes_GET_SIZE(rep);
2652 else
2653 repsize = PyUnicode_GET_SIZE(rep);
2654
2655 if (repsize > 4) {
2656 Py_ssize_t offset;
2657
2658 if (result == NULL)
2659 offset = p - stackbuf;
2660 else
2661 offset = p - PyBytes_AS_STRING(result);
2662
2663 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2664 /* integer overflow */
2665 PyErr_NoMemory();
2666 goto error;
2667 }
2668 nallocated += repsize - 4;
2669 if (result != NULL) {
2670 if (_PyBytes_Resize(&result, nallocated) < 0)
2671 goto error;
2672 } else {
2673 result = PyBytes_FromStringAndSize(NULL, nallocated);
2674 if (result == NULL)
2675 goto error;
2676 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2677 }
2678 p = PyBytes_AS_STRING(result) + offset;
2679 }
2680
2681 if (PyBytes_Check(rep)) {
2682 char *prep = PyBytes_AS_STRING(rep);
2683 for(k = repsize; k > 0; k--)
2684 *p++ = *prep++;
2685 } else /* rep is unicode */ {
2686 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2687 Py_UNICODE c;
2688
2689 for(k=0; k<repsize; k++) {
2690 c = prep[k];
2691 if (0x80 <= c) {
2692 raise_encode_exception(&exc, "utf-8", s, size,
2693 i-1, i, "surrogates not allowed");
2694 goto error;
2695 }
2696 *p++ = (char)prep[k];
2697 }
2698 }
2699 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002700#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002701 }
Victor Stinner445a6232010-04-22 20:01:57 +00002702#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002703 } else if (ch < 0x10000) {
2704 *p++ = (char)(0xe0 | (ch >> 12));
2705 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2706 *p++ = (char)(0x80 | (ch & 0x3f));
2707 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002708 /* Encode UCS4 Unicode ordinals */
2709 *p++ = (char)(0xf0 | (ch >> 18));
2710 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2711 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2712 *p++ = (char)(0x80 | (ch & 0x3f));
2713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002715
Guido van Rossum98297ee2007-11-06 21:34:58 +00002716 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002717 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002718 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002719 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002720 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002721 }
2722 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002723 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002724 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002725 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002726 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002727 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002730 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002731 error:
2732 Py_XDECREF(errorHandler);
2733 Py_XDECREF(exc);
2734 Py_XDECREF(result);
2735 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002736
Tim Peters602f7402002-04-27 18:03:26 +00002737#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738}
2739
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2741{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 if (!PyUnicode_Check(unicode)) {
2743 PyErr_BadArgument();
2744 return NULL;
2745 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002746 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002747 PyUnicode_GET_SIZE(unicode),
2748 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749}
2750
Walter Dörwald41980ca2007-08-16 21:55:45 +00002751/* --- UTF-32 Codec ------------------------------------------------------- */
2752
2753PyObject *
2754PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002755 Py_ssize_t size,
2756 const char *errors,
2757 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002758{
2759 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2760}
2761
2762PyObject *
2763PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 Py_ssize_t size,
2765 const char *errors,
2766 int *byteorder,
2767 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002768{
2769 const char *starts = s;
2770 Py_ssize_t startinpos;
2771 Py_ssize_t endinpos;
2772 Py_ssize_t outpos;
2773 PyUnicodeObject *unicode;
2774 Py_UNICODE *p;
2775#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002776 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002777 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002778#else
2779 const int pairs = 0;
2780#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002781 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002782 int bo = 0; /* assume native ordering by default */
2783 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002784 /* Offsets from q for retrieving bytes in the right order. */
2785#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2786 int iorder[] = {0, 1, 2, 3};
2787#else
2788 int iorder[] = {3, 2, 1, 0};
2789#endif
2790 PyObject *errorHandler = NULL;
2791 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002792
Walter Dörwald41980ca2007-08-16 21:55:45 +00002793 q = (unsigned char *)s;
2794 e = q + size;
2795
2796 if (byteorder)
2797 bo = *byteorder;
2798
2799 /* Check for BOM marks (U+FEFF) in the input and adjust current
2800 byte order setting accordingly. In native mode, the leading BOM
2801 mark is skipped, in all other modes, it is copied to the output
2802 stream as-is (giving a ZWNBSP character). */
2803 if (bo == 0) {
2804 if (size >= 4) {
2805 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002806 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002807#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002808 if (bom == 0x0000FEFF) {
2809 q += 4;
2810 bo = -1;
2811 }
2812 else if (bom == 0xFFFE0000) {
2813 q += 4;
2814 bo = 1;
2815 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002816#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002817 if (bom == 0x0000FEFF) {
2818 q += 4;
2819 bo = 1;
2820 }
2821 else if (bom == 0xFFFE0000) {
2822 q += 4;
2823 bo = -1;
2824 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002825#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002827 }
2828
2829 if (bo == -1) {
2830 /* force LE */
2831 iorder[0] = 0;
2832 iorder[1] = 1;
2833 iorder[2] = 2;
2834 iorder[3] = 3;
2835 }
2836 else if (bo == 1) {
2837 /* force BE */
2838 iorder[0] = 3;
2839 iorder[1] = 2;
2840 iorder[2] = 1;
2841 iorder[3] = 0;
2842 }
2843
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002844 /* On narrow builds we split characters outside the BMP into two
2845 codepoints => count how much extra space we need. */
2846#ifndef Py_UNICODE_WIDE
2847 for (qq = q; qq < e; qq += 4)
2848 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2849 pairs++;
2850#endif
2851
2852 /* This might be one to much, because of a BOM */
2853 unicode = _PyUnicode_New((size+3)/4+pairs);
2854 if (!unicode)
2855 return NULL;
2856 if (size == 0)
2857 return (PyObject *)unicode;
2858
2859 /* Unpack UTF-32 encoded data */
2860 p = unicode->str;
2861
Walter Dörwald41980ca2007-08-16 21:55:45 +00002862 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002863 Py_UCS4 ch;
2864 /* remaining bytes at the end? (size should be divisible by 4) */
2865 if (e-q<4) {
2866 if (consumed)
2867 break;
2868 errmsg = "truncated data";
2869 startinpos = ((const char *)q)-starts;
2870 endinpos = ((const char *)e)-starts;
2871 goto utf32Error;
2872 /* The remaining input chars are ignored if the callback
2873 chooses to skip the input */
2874 }
2875 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2876 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002877
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 if (ch >= 0x110000)
2879 {
2880 errmsg = "codepoint not in range(0x110000)";
2881 startinpos = ((const char *)q)-starts;
2882 endinpos = startinpos+4;
2883 goto utf32Error;
2884 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002885#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002886 if (ch >= 0x10000)
2887 {
2888 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2889 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2890 }
2891 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002892#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 *p++ = ch;
2894 q += 4;
2895 continue;
2896 utf32Error:
2897 outpos = p-PyUnicode_AS_UNICODE(unicode);
2898 if (unicode_decode_call_errorhandler(
2899 errors, &errorHandler,
2900 "utf32", errmsg,
2901 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2902 &unicode, &outpos, &p))
2903 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002904 }
2905
2906 if (byteorder)
2907 *byteorder = bo;
2908
2909 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002910 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002911
2912 /* Adjust length */
2913 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2914 goto onError;
2915
2916 Py_XDECREF(errorHandler);
2917 Py_XDECREF(exc);
2918 return (PyObject *)unicode;
2919
Benjamin Peterson29060642009-01-31 22:14:21 +00002920 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002921 Py_DECREF(unicode);
2922 Py_XDECREF(errorHandler);
2923 Py_XDECREF(exc);
2924 return NULL;
2925}
2926
2927PyObject *
2928PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002929 Py_ssize_t size,
2930 const char *errors,
2931 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002932{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002933 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002934 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002935 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002936#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002937 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002938#else
2939 const int pairs = 0;
2940#endif
2941 /* Offsets from p for storing byte pairs in the right order. */
2942#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2943 int iorder[] = {0, 1, 2, 3};
2944#else
2945 int iorder[] = {3, 2, 1, 0};
2946#endif
2947
Benjamin Peterson29060642009-01-31 22:14:21 +00002948#define STORECHAR(CH) \
2949 do { \
2950 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2951 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2952 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2953 p[iorder[0]] = (CH) & 0xff; \
2954 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002955 } while(0)
2956
2957 /* In narrow builds we can output surrogate pairs as one codepoint,
2958 so we need less space. */
2959#ifndef Py_UNICODE_WIDE
2960 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002961 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2962 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2963 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002964#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002965 nsize = (size - pairs + (byteorder == 0));
2966 bytesize = nsize * 4;
2967 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002968 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002969 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002970 if (v == NULL)
2971 return NULL;
2972
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002973 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002974 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002976 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002977 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002978
2979 if (byteorder == -1) {
2980 /* force LE */
2981 iorder[0] = 0;
2982 iorder[1] = 1;
2983 iorder[2] = 2;
2984 iorder[3] = 3;
2985 }
2986 else if (byteorder == 1) {
2987 /* force BE */
2988 iorder[0] = 3;
2989 iorder[1] = 2;
2990 iorder[2] = 1;
2991 iorder[3] = 0;
2992 }
2993
2994 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002996#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002997 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2998 Py_UCS4 ch2 = *s;
2999 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3000 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3001 s++;
3002 size--;
3003 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003004 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003005#endif
3006 STORECHAR(ch);
3007 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003008
3009 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003010 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003011#undef STORECHAR
3012}
3013
3014PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3015{
3016 if (!PyUnicode_Check(unicode)) {
3017 PyErr_BadArgument();
3018 return NULL;
3019 }
3020 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 PyUnicode_GET_SIZE(unicode),
3022 NULL,
3023 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003024}
3025
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026/* --- UTF-16 Codec ------------------------------------------------------- */
3027
Tim Peters772747b2001-08-09 22:21:55 +00003028PyObject *
3029PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 Py_ssize_t size,
3031 const char *errors,
3032 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033{
Walter Dörwald69652032004-09-07 20:24:22 +00003034 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3035}
3036
Antoine Pitrouab868312009-01-10 15:40:25 +00003037/* Two masks for fast checking of whether a C 'long' may contain
3038 UTF16-encoded surrogate characters. This is an efficient heuristic,
3039 assuming that non-surrogate characters with a code point >= 0x8000 are
3040 rare in most input.
3041 FAST_CHAR_MASK is used when the input is in native byte ordering,
3042 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003043*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003044#if (SIZEOF_LONG == 8)
3045# define FAST_CHAR_MASK 0x8000800080008000L
3046# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3047#elif (SIZEOF_LONG == 4)
3048# define FAST_CHAR_MASK 0x80008000L
3049# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3050#else
3051# error C 'long' size should be either 4 or 8!
3052#endif
3053
Walter Dörwald69652032004-09-07 20:24:22 +00003054PyObject *
3055PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 Py_ssize_t size,
3057 const char *errors,
3058 int *byteorder,
3059 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003060{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003062 Py_ssize_t startinpos;
3063 Py_ssize_t endinpos;
3064 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 PyUnicodeObject *unicode;
3066 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003067 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003068 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003069 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003070 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003071 /* Offsets from q for retrieving byte pairs in the right order. */
3072#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3073 int ihi = 1, ilo = 0;
3074#else
3075 int ihi = 0, ilo = 1;
3076#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077 PyObject *errorHandler = NULL;
3078 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079
3080 /* Note: size will always be longer than the resulting Unicode
3081 character count */
3082 unicode = _PyUnicode_New(size);
3083 if (!unicode)
3084 return NULL;
3085 if (size == 0)
3086 return (PyObject *)unicode;
3087
3088 /* Unpack UTF-16 encoded data */
3089 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003090 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003091 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092
3093 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003094 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003096 /* Check for BOM marks (U+FEFF) in the input and adjust current
3097 byte order setting accordingly. In native mode, the leading BOM
3098 mark is skipped, in all other modes, it is copied to the output
3099 stream as-is (giving a ZWNBSP character). */
3100 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003101 if (size >= 2) {
3102 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003103#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003104 if (bom == 0xFEFF) {
3105 q += 2;
3106 bo = -1;
3107 }
3108 else if (bom == 0xFFFE) {
3109 q += 2;
3110 bo = 1;
3111 }
Tim Petersced69f82003-09-16 20:30:58 +00003112#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003113 if (bom == 0xFEFF) {
3114 q += 2;
3115 bo = 1;
3116 }
3117 else if (bom == 0xFFFE) {
3118 q += 2;
3119 bo = -1;
3120 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003121#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003122 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124
Tim Peters772747b2001-08-09 22:21:55 +00003125 if (bo == -1) {
3126 /* force LE */
3127 ihi = 1;
3128 ilo = 0;
3129 }
3130 else if (bo == 1) {
3131 /* force BE */
3132 ihi = 0;
3133 ilo = 1;
3134 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003135#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3136 native_ordering = ilo < ihi;
3137#else
3138 native_ordering = ilo > ihi;
3139#endif
Tim Peters772747b2001-08-09 22:21:55 +00003140
Antoine Pitrouab868312009-01-10 15:40:25 +00003141 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003142 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003143 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003144 /* First check for possible aligned read of a C 'long'. Unaligned
3145 reads are more expensive, better to defer to another iteration. */
3146 if (!((size_t) q & LONG_PTR_MASK)) {
3147 /* Fast path for runs of non-surrogate chars. */
3148 register const unsigned char *_q = q;
3149 Py_UNICODE *_p = p;
3150 if (native_ordering) {
3151 /* Native ordering is simple: as long as the input cannot
3152 possibly contain a surrogate char, do an unrolled copy
3153 of several 16-bit code points to the target object.
3154 The non-surrogate check is done on several input bytes
3155 at a time (as many as a C 'long' can contain). */
3156 while (_q < aligned_end) {
3157 unsigned long data = * (unsigned long *) _q;
3158 if (data & FAST_CHAR_MASK)
3159 break;
3160 _p[0] = ((unsigned short *) _q)[0];
3161 _p[1] = ((unsigned short *) _q)[1];
3162#if (SIZEOF_LONG == 8)
3163 _p[2] = ((unsigned short *) _q)[2];
3164 _p[3] = ((unsigned short *) _q)[3];
3165#endif
3166 _q += SIZEOF_LONG;
3167 _p += SIZEOF_LONG / 2;
3168 }
3169 }
3170 else {
3171 /* Byteswapped ordering is similar, but we must decompose
3172 the copy bytewise, and take care of zero'ing out the
3173 upper bytes if the target object is in 32-bit units
3174 (that is, in UCS-4 builds). */
3175 while (_q < aligned_end) {
3176 unsigned long data = * (unsigned long *) _q;
3177 if (data & SWAPPED_FAST_CHAR_MASK)
3178 break;
3179 /* Zero upper bytes in UCS-4 builds */
3180#if (Py_UNICODE_SIZE > 2)
3181 _p[0] = 0;
3182 _p[1] = 0;
3183#if (SIZEOF_LONG == 8)
3184 _p[2] = 0;
3185 _p[3] = 0;
3186#endif
3187#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003188 /* Issue #4916; UCS-4 builds on big endian machines must
3189 fill the two last bytes of each 4-byte unit. */
3190#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3191# define OFF 2
3192#else
3193# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003194#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003195 ((unsigned char *) _p)[OFF + 1] = _q[0];
3196 ((unsigned char *) _p)[OFF + 0] = _q[1];
3197 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3198 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3199#if (SIZEOF_LONG == 8)
3200 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3201 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3202 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3203 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3204#endif
3205#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003206 _q += SIZEOF_LONG;
3207 _p += SIZEOF_LONG / 2;
3208 }
3209 }
3210 p = _p;
3211 q = _q;
3212 if (q >= e)
3213 break;
3214 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003215 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216
Benjamin Peterson14339b62009-01-31 16:36:08 +00003217 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003218
3219 if (ch < 0xD800 || ch > 0xDFFF) {
3220 *p++ = ch;
3221 continue;
3222 }
3223
3224 /* UTF-16 code pair: */
3225 if (q > e) {
3226 errmsg = "unexpected end of data";
3227 startinpos = (((const char *)q) - 2) - starts;
3228 endinpos = ((const char *)e) + 1 - starts;
3229 goto utf16Error;
3230 }
3231 if (0xD800 <= ch && ch <= 0xDBFF) {
3232 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3233 q += 2;
3234 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003235#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 *p++ = ch;
3237 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003238#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003240#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 continue;
3242 }
3243 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003244 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003245 startinpos = (((const char *)q)-4)-starts;
3246 endinpos = startinpos+2;
3247 goto utf16Error;
3248 }
3249
Benjamin Peterson14339b62009-01-31 16:36:08 +00003250 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003251 errmsg = "illegal encoding";
3252 startinpos = (((const char *)q)-2)-starts;
3253 endinpos = startinpos+2;
3254 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003255
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 utf16Error:
3257 outpos = p - PyUnicode_AS_UNICODE(unicode);
3258 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003259 errors,
3260 &errorHandler,
3261 "utf16", errmsg,
3262 &starts,
3263 (const char **)&e,
3264 &startinpos,
3265 &endinpos,
3266 &exc,
3267 (const char **)&q,
3268 &unicode,
3269 &outpos,
3270 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003271 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003273 /* remaining byte at the end? (size should be even) */
3274 if (e == q) {
3275 if (!consumed) {
3276 errmsg = "truncated data";
3277 startinpos = ((const char *)q) - starts;
3278 endinpos = ((const char *)e) + 1 - starts;
3279 outpos = p - PyUnicode_AS_UNICODE(unicode);
3280 if (unicode_decode_call_errorhandler(
3281 errors,
3282 &errorHandler,
3283 "utf16", errmsg,
3284 &starts,
3285 (const char **)&e,
3286 &startinpos,
3287 &endinpos,
3288 &exc,
3289 (const char **)&q,
3290 &unicode,
3291 &outpos,
3292 &p))
3293 goto onError;
3294 /* The remaining input chars are ignored if the callback
3295 chooses to skip the input */
3296 }
3297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298
3299 if (byteorder)
3300 *byteorder = bo;
3301
Walter Dörwald69652032004-09-07 20:24:22 +00003302 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003304
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003306 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 goto onError;
3308
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 Py_XDECREF(errorHandler);
3310 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 return (PyObject *)unicode;
3312
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 Py_XDECREF(errorHandler);
3316 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 return NULL;
3318}
3319
Antoine Pitrouab868312009-01-10 15:40:25 +00003320#undef FAST_CHAR_MASK
3321#undef SWAPPED_FAST_CHAR_MASK
3322
Tim Peters772747b2001-08-09 22:21:55 +00003323PyObject *
3324PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003325 Py_ssize_t size,
3326 const char *errors,
3327 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003329 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003330 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003331 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003332#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003333 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003334#else
3335 const int pairs = 0;
3336#endif
Tim Peters772747b2001-08-09 22:21:55 +00003337 /* Offsets from p for storing byte pairs in the right order. */
3338#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3339 int ihi = 1, ilo = 0;
3340#else
3341 int ihi = 0, ilo = 1;
3342#endif
3343
Benjamin Peterson29060642009-01-31 22:14:21 +00003344#define STORECHAR(CH) \
3345 do { \
3346 p[ihi] = ((CH) >> 8) & 0xff; \
3347 p[ilo] = (CH) & 0xff; \
3348 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003349 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003351#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003352 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003353 if (s[i] >= 0x10000)
3354 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003355#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003356 /* 2 * (size + pairs + (byteorder == 0)) */
3357 if (size > PY_SSIZE_T_MAX ||
3358 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003359 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003360 nsize = size + pairs + (byteorder == 0);
3361 bytesize = nsize * 2;
3362 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003363 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003364 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365 if (v == NULL)
3366 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003368 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003371 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003372 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003373
3374 if (byteorder == -1) {
3375 /* force LE */
3376 ihi = 1;
3377 ilo = 0;
3378 }
3379 else if (byteorder == 1) {
3380 /* force BE */
3381 ihi = 0;
3382 ilo = 1;
3383 }
3384
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003385 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003386 Py_UNICODE ch = *s++;
3387 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003388#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003389 if (ch >= 0x10000) {
3390 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3391 ch = 0xD800 | ((ch-0x10000) >> 10);
3392 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003393#endif
Tim Peters772747b2001-08-09 22:21:55 +00003394 STORECHAR(ch);
3395 if (ch2)
3396 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003397 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003398
3399 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003400 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003401#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402}
3403
3404PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3405{
3406 if (!PyUnicode_Check(unicode)) {
3407 PyErr_BadArgument();
3408 return NULL;
3409 }
3410 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003411 PyUnicode_GET_SIZE(unicode),
3412 NULL,
3413 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414}
3415
3416/* --- Unicode Escape Codec ----------------------------------------------- */
3417
Fredrik Lundh06d12682001-01-24 07:59:11 +00003418static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003419
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003421 Py_ssize_t size,
3422 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003425 Py_ssize_t startinpos;
3426 Py_ssize_t endinpos;
3427 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003432 char* message;
3433 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 PyObject *errorHandler = NULL;
3435 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003436
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 /* Escaped strings will always be longer than the resulting
3438 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 length after conversion to the true value.
3440 (but if the error callback returns a long replacement string
3441 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442 v = _PyUnicode_New(size);
3443 if (v == NULL)
3444 goto onError;
3445 if (size == 0)
3446 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003447
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003450
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 while (s < end) {
3452 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003453 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455
3456 /* Non-escape characters are interpreted as Unicode ordinals */
3457 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003458 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 continue;
3460 }
3461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 /* \ - Escapes */
3464 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003465 c = *s++;
3466 if (s > end)
3467 c = '\0'; /* Invalid after \ */
3468 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 case '\n': break;
3472 case '\\': *p++ = '\\'; break;
3473 case '\'': *p++ = '\''; break;
3474 case '\"': *p++ = '\"'; break;
3475 case 'b': *p++ = '\b'; break;
3476 case 'f': *p++ = '\014'; break; /* FF */
3477 case 't': *p++ = '\t'; break;
3478 case 'n': *p++ = '\n'; break;
3479 case 'r': *p++ = '\r'; break;
3480 case 'v': *p++ = '\013'; break; /* VT */
3481 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3482
Benjamin Peterson29060642009-01-31 22:14:21 +00003483 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 case '0': case '1': case '2': case '3':
3485 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003486 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003487 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003488 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003489 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003490 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003492 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 break;
3494
Benjamin Peterson29060642009-01-31 22:14:21 +00003495 /* hex escapes */
3496 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003498 digits = 2;
3499 message = "truncated \\xXX escape";
3500 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501
Benjamin Peterson29060642009-01-31 22:14:21 +00003502 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003504 digits = 4;
3505 message = "truncated \\uXXXX escape";
3506 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507
Benjamin Peterson29060642009-01-31 22:14:21 +00003508 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003509 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003510 digits = 8;
3511 message = "truncated \\UXXXXXXXX escape";
3512 hexescape:
3513 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003514 outpos = p-PyUnicode_AS_UNICODE(v);
3515 if (s+digits>end) {
3516 endinpos = size;
3517 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003518 errors, &errorHandler,
3519 "unicodeescape", "end of string in escape sequence",
3520 &starts, &end, &startinpos, &endinpos, &exc, &s,
3521 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 goto onError;
3523 goto nextByte;
3524 }
3525 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003526 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003527 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 endinpos = (s+i+1)-starts;
3529 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003530 errors, &errorHandler,
3531 "unicodeescape", message,
3532 &starts, &end, &startinpos, &endinpos, &exc, &s,
3533 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003534 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003536 }
3537 chr = (chr<<4) & ~0xF;
3538 if (c >= '0' && c <= '9')
3539 chr += c - '0';
3540 else if (c >= 'a' && c <= 'f')
3541 chr += 10 + c - 'a';
3542 else
3543 chr += 10 + c - 'A';
3544 }
3545 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003546 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 /* _decoding_error will have already written into the
3548 target buffer. */
3549 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003550 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003551 /* when we get here, chr is a 32-bit unicode character */
3552 if (chr <= 0xffff)
3553 /* UCS-2 character */
3554 *p++ = (Py_UNICODE) chr;
3555 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003556 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003557 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003558#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003559 *p++ = chr;
3560#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003561 chr -= 0x10000L;
3562 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003563 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003564#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003565 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 endinpos = s-starts;
3567 outpos = p-PyUnicode_AS_UNICODE(v);
3568 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 errors, &errorHandler,
3570 "unicodeescape", "illegal Unicode character",
3571 &starts, &end, &startinpos, &endinpos, &exc, &s,
3572 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003573 goto onError;
3574 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003575 break;
3576
Benjamin Peterson29060642009-01-31 22:14:21 +00003577 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003578 case 'N':
3579 message = "malformed \\N character escape";
3580 if (ucnhash_CAPI == NULL) {
3581 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003582 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003583 if (ucnhash_CAPI == NULL)
3584 goto ucnhashError;
3585 }
3586 if (*s == '{') {
3587 const char *start = s+1;
3588 /* look for the closing brace */
3589 while (*s != '}' && s < end)
3590 s++;
3591 if (s > start && s < end && *s == '}') {
3592 /* found a name. look it up in the unicode database */
3593 message = "unknown Unicode character name";
3594 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003595 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003596 goto store;
3597 }
3598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 endinpos = s-starts;
3600 outpos = p-PyUnicode_AS_UNICODE(v);
3601 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 errors, &errorHandler,
3603 "unicodeescape", message,
3604 &starts, &end, &startinpos, &endinpos, &exc, &s,
3605 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003606 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003607 break;
3608
3609 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003610 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 message = "\\ at end of string";
3612 s--;
3613 endinpos = s-starts;
3614 outpos = p-PyUnicode_AS_UNICODE(v);
3615 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 errors, &errorHandler,
3617 "unicodeescape", message,
3618 &starts, &end, &startinpos, &endinpos, &exc, &s,
3619 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003620 goto onError;
3621 }
3622 else {
3623 *p++ = '\\';
3624 *p++ = (unsigned char)s[-1];
3625 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003626 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003631 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003633 Py_XDECREF(errorHandler);
3634 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003636
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003638 PyErr_SetString(
3639 PyExc_UnicodeError,
3640 "\\N escapes not supported (can't load unicodedata module)"
3641 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003642 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 Py_XDECREF(errorHandler);
3644 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003645 return NULL;
3646
Benjamin Peterson29060642009-01-31 22:14:21 +00003647 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 Py_XDECREF(errorHandler);
3650 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 return NULL;
3652}
3653
3654/* Return a Unicode-Escape string version of the Unicode object.
3655
3656 If quotes is true, the string is enclosed in u"" or u'' quotes as
3657 appropriate.
3658
3659*/
3660
Thomas Wouters477c8d52006-05-27 19:21:47 +00003661Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 Py_ssize_t size,
3663 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003664{
3665 /* like wcschr, but doesn't stop at NULL characters */
3666
3667 while (size-- > 0) {
3668 if (*s == ch)
3669 return s;
3670 s++;
3671 }
3672
3673 return NULL;
3674}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003675
Walter Dörwald79e913e2007-05-12 11:08:06 +00003676static const char *hexdigits = "0123456789abcdef";
3677
3678PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003681 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003684#ifdef Py_UNICODE_WIDE
3685 const Py_ssize_t expandsize = 10;
3686#else
3687 const Py_ssize_t expandsize = 6;
3688#endif
3689
Thomas Wouters89f507f2006-12-13 04:49:30 +00003690 /* XXX(nnorwitz): rather than over-allocating, it would be
3691 better to choose a different scheme. Perhaps scan the
3692 first N-chars of the string and allocate based on that size.
3693 */
3694 /* Initial allocation is based on the longest-possible unichr
3695 escape.
3696
3697 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3698 unichr, so in this case it's the longest unichr escape. In
3699 narrow (UTF-16) builds this is five chars per source unichr
3700 since there are two unichrs in the surrogate pair, so in narrow
3701 (UTF-16) builds it's not the longest unichr escape.
3702
3703 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3704 so in the narrow (UTF-16) build case it's the longest unichr
3705 escape.
3706 */
3707
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003708 if (size == 0)
3709 return PyBytes_FromStringAndSize(NULL, 0);
3710
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003711 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003713
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003714 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003715 2
3716 + expandsize*size
3717 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718 if (repr == NULL)
3719 return NULL;
3720
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003721 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 while (size-- > 0) {
3724 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003725
Walter Dörwald79e913e2007-05-12 11:08:06 +00003726 /* Escape backslashes */
3727 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 *p++ = '\\';
3729 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003730 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003731 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003732
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003733#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003734 /* Map 21-bit characters to '\U00xxxxxx' */
3735 else if (ch >= 0x10000) {
3736 *p++ = '\\';
3737 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003738 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3739 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3740 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3741 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3742 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3743 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3744 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3745 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003747 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003748#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003749 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3750 else if (ch >= 0xD800 && ch < 0xDC00) {
3751 Py_UNICODE ch2;
3752 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003753
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 ch2 = *s++;
3755 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003756 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3758 *p++ = '\\';
3759 *p++ = 'U';
3760 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3761 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3762 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3763 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3764 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3765 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3766 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3767 *p++ = hexdigits[ucs & 0x0000000F];
3768 continue;
3769 }
3770 /* Fall through: isolated surrogates are copied as-is */
3771 s--;
3772 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003773 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003774#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003775
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003777 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 *p++ = '\\';
3779 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003780 *p++ = hexdigits[(ch >> 12) & 0x000F];
3781 *p++ = hexdigits[(ch >> 8) & 0x000F];
3782 *p++ = hexdigits[(ch >> 4) & 0x000F];
3783 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003785
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003786 /* Map special whitespace to '\t', \n', '\r' */
3787 else if (ch == '\t') {
3788 *p++ = '\\';
3789 *p++ = 't';
3790 }
3791 else if (ch == '\n') {
3792 *p++ = '\\';
3793 *p++ = 'n';
3794 }
3795 else if (ch == '\r') {
3796 *p++ = '\\';
3797 *p++ = 'r';
3798 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003799
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003800 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003801 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003803 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003804 *p++ = hexdigits[(ch >> 4) & 0x000F];
3805 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003806 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003807
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 /* Copy everything else as-is */
3809 else
3810 *p++ = (char) ch;
3811 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003813 assert(p - PyBytes_AS_STRING(repr) > 0);
3814 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3815 return NULL;
3816 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817}
3818
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003819PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003821 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 if (!PyUnicode_Check(unicode)) {
3823 PyErr_BadArgument();
3824 return NULL;
3825 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003826 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3827 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003828 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829}
3830
/* --- Raw Unicode Escape Codec ------------------------------------------- */
3832
3833PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003834 Py_ssize_t size,
3835 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003837 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003838 Py_ssize_t startinpos;
3839 Py_ssize_t endinpos;
3840 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003842 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 const char *end;
3844 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003845 PyObject *errorHandler = NULL;
3846 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003847
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848 /* Escaped strings will always be longer than the resulting
3849 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850 length after conversion to the true value. (But decoding error
3851 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 v = _PyUnicode_New(size);
3853 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003854 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003856 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 end = s + size;
3859 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003860 unsigned char c;
3861 Py_UCS4 x;
3862 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003863 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 /* Non-escape characters are interpreted as Unicode ordinals */
3866 if (*s != '\\') {
3867 *p++ = (unsigned char)*s++;
3868 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003869 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003870 startinpos = s-starts;
3871
3872 /* \u-escapes are only interpreted iff the number of leading
3873 backslashes if odd */
3874 bs = s;
3875 for (;s < end;) {
3876 if (*s != '\\')
3877 break;
3878 *p++ = (unsigned char)*s++;
3879 }
3880 if (((s - bs) & 1) == 0 ||
3881 s >= end ||
3882 (*s != 'u' && *s != 'U')) {
3883 continue;
3884 }
3885 p--;
3886 count = *s=='u' ? 4 : 8;
3887 s++;
3888
3889 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3890 outpos = p-PyUnicode_AS_UNICODE(v);
3891 for (x = 0, i = 0; i < count; ++i, ++s) {
3892 c = (unsigned char)*s;
3893 if (!ISXDIGIT(c)) {
3894 endinpos = s-starts;
3895 if (unicode_decode_call_errorhandler(
3896 errors, &errorHandler,
3897 "rawunicodeescape", "truncated \\uXXXX",
3898 &starts, &end, &startinpos, &endinpos, &exc, &s,
3899 &v, &outpos, &p))
3900 goto onError;
3901 goto nextByte;
3902 }
3903 x = (x<<4) & ~0xF;
3904 if (c >= '0' && c <= '9')
3905 x += c - '0';
3906 else if (c >= 'a' && c <= 'f')
3907 x += 10 + c - 'a';
3908 else
3909 x += 10 + c - 'A';
3910 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003911 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 /* UCS-2 character */
3913 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003914 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003915 /* UCS-4 character. Either store directly, or as
3916 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003917#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003918 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003919#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003920 x -= 0x10000L;
3921 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3922 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003923#endif
3924 } else {
3925 endinpos = s-starts;
3926 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003927 if (unicode_decode_call_errorhandler(
3928 errors, &errorHandler,
3929 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 &starts, &end, &startinpos, &endinpos, &exc, &s,
3931 &v, &outpos, &p))
3932 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003933 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003934 nextByte:
3935 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003937 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003938 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003939 Py_XDECREF(errorHandler);
3940 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003942
Benjamin Peterson29060642009-01-31 22:14:21 +00003943 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 Py_XDECREF(errorHandler);
3946 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 return NULL;
3948}
3949
3950PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003951 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003953 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 char *p;
3955 char *q;
3956
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003957#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003958 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003959#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003960 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003961#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003962
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003963 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003964 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003965
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003966 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 if (repr == NULL)
3968 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003969 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003970 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003972 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 while (size-- > 0) {
3974 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003975#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 /* Map 32-bit characters to '\Uxxxxxxxx' */
3977 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003978 *p++ = '\\';
3979 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003980 *p++ = hexdigits[(ch >> 28) & 0xf];
3981 *p++ = hexdigits[(ch >> 24) & 0xf];
3982 *p++ = hexdigits[(ch >> 20) & 0xf];
3983 *p++ = hexdigits[(ch >> 16) & 0xf];
3984 *p++ = hexdigits[(ch >> 12) & 0xf];
3985 *p++ = hexdigits[(ch >> 8) & 0xf];
3986 *p++ = hexdigits[(ch >> 4) & 0xf];
3987 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003988 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003989 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003990#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3992 if (ch >= 0xD800 && ch < 0xDC00) {
3993 Py_UNICODE ch2;
3994 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003995
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 ch2 = *s++;
3997 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003998 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4000 *p++ = '\\';
4001 *p++ = 'U';
4002 *p++ = hexdigits[(ucs >> 28) & 0xf];
4003 *p++ = hexdigits[(ucs >> 24) & 0xf];
4004 *p++ = hexdigits[(ucs >> 20) & 0xf];
4005 *p++ = hexdigits[(ucs >> 16) & 0xf];
4006 *p++ = hexdigits[(ucs >> 12) & 0xf];
4007 *p++ = hexdigits[(ucs >> 8) & 0xf];
4008 *p++ = hexdigits[(ucs >> 4) & 0xf];
4009 *p++ = hexdigits[ucs & 0xf];
4010 continue;
4011 }
4012 /* Fall through: isolated surrogates are copied as-is */
4013 s--;
4014 size++;
4015 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004016#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004017 /* Map 16-bit characters to '\uxxxx' */
4018 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 *p++ = '\\';
4020 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004021 *p++ = hexdigits[(ch >> 12) & 0xf];
4022 *p++ = hexdigits[(ch >> 8) & 0xf];
4023 *p++ = hexdigits[(ch >> 4) & 0xf];
4024 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004026 /* Copy everything else as-is */
4027 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 *p++ = (char) ch;
4029 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004030 size = p - q;
4031
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004032 assert(size > 0);
4033 if (_PyBytes_Resize(&repr, size) < 0)
4034 return NULL;
4035 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036}
4037
4038PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4039{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004040 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004042 PyErr_BadArgument();
4043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004045 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4046 PyUnicode_GET_SIZE(unicode));
4047
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004048 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049}
4050
/* --- Unicode Internal Codec ------------------------------------------- */
4052
4053PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 Py_ssize_t size,
4055 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004056{
4057 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004058 Py_ssize_t startinpos;
4059 Py_ssize_t endinpos;
4060 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004061 PyUnicodeObject *v;
4062 Py_UNICODE *p;
4063 const char *end;
4064 const char *reason;
4065 PyObject *errorHandler = NULL;
4066 PyObject *exc = NULL;
4067
Neal Norwitzd43069c2006-01-08 01:12:10 +00004068#ifdef Py_UNICODE_WIDE
4069 Py_UNICODE unimax = PyUnicode_GetMax();
4070#endif
4071
Thomas Wouters89f507f2006-12-13 04:49:30 +00004072 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004073 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4074 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004076 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004077 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004078 p = PyUnicode_AS_UNICODE(v);
4079 end = s + size;
4080
4081 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004082 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004083 /* We have to sanity check the raw data, otherwise doom looms for
4084 some malformed UCS-4 data. */
4085 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004086#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004087 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004088#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004089 end-s < Py_UNICODE_SIZE
4090 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004092 startinpos = s - starts;
4093 if (end-s < Py_UNICODE_SIZE) {
4094 endinpos = end-starts;
4095 reason = "truncated input";
4096 }
4097 else {
4098 endinpos = s - starts + Py_UNICODE_SIZE;
4099 reason = "illegal code point (> 0x10FFFF)";
4100 }
4101 outpos = p - PyUnicode_AS_UNICODE(v);
4102 if (unicode_decode_call_errorhandler(
4103 errors, &errorHandler,
4104 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004105 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004106 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004107 goto onError;
4108 }
4109 }
4110 else {
4111 p++;
4112 s += Py_UNICODE_SIZE;
4113 }
4114 }
4115
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004116 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004117 goto onError;
4118 Py_XDECREF(errorHandler);
4119 Py_XDECREF(exc);
4120 return (PyObject *)v;
4121
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004123 Py_XDECREF(v);
4124 Py_XDECREF(errorHandler);
4125 Py_XDECREF(exc);
4126 return NULL;
4127}
4128
/* --- Latin-1 Codec ------------------------------------------------------ */
4130
4131PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 Py_ssize_t size,
4133 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134{
4135 PyUnicodeObject *v;
4136 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004137 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004138
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004140 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 Py_UNICODE r = *(unsigned char*)s;
4142 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004143 }
4144
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 v = _PyUnicode_New(size);
4146 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004151 e = s + size;
4152 /* Unrolling the copy makes it much faster by reducing the looping
4153 overhead. This is similar to what many memcpy() implementations do. */
4154 unrolled_end = e - 4;
4155 while (s < unrolled_end) {
4156 p[0] = (unsigned char) s[0];
4157 p[1] = (unsigned char) s[1];
4158 p[2] = (unsigned char) s[2];
4159 p[3] = (unsigned char) s[3];
4160 s += 4;
4161 p += 4;
4162 }
4163 while (s < e)
4164 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004166
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 Py_XDECREF(v);
4169 return NULL;
4170}
4171
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172/* create or adjust a UnicodeEncodeError */
4173static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 const char *encoding,
4175 const Py_UNICODE *unicode, Py_ssize_t size,
4176 Py_ssize_t startpos, Py_ssize_t endpos,
4177 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 *exceptionObject = PyUnicodeEncodeError_Create(
4181 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182 }
4183 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004184 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4185 goto onError;
4186 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4187 goto onError;
4188 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4189 goto onError;
4190 return;
4191 onError:
4192 Py_DECREF(*exceptionObject);
4193 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 }
4195}
4196
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197/* raises a UnicodeEncodeError */
4198static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 const char *encoding,
4200 const Py_UNICODE *unicode, Py_ssize_t size,
4201 Py_ssize_t startpos, Py_ssize_t endpos,
4202 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203{
4204 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004205 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208}
4209
4210/* error handling callback helper:
4211 build arguments, call the callback and check the arguments,
4212 put the result into newpos and return the replacement string, which
4213 has to be freed by the caller */
4214static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004215 PyObject **errorHandler,
4216 const char *encoding, const char *reason,
4217 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4218 Py_ssize_t startpos, Py_ssize_t endpos,
4219 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004221 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222
4223 PyObject *restuple;
4224 PyObject *resunicode;
4225
4226 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 }
4231
4232 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004235 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236
4237 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004242 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 Py_DECREF(restuple);
4244 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004246 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 &resunicode, newpos)) {
4248 Py_DECREF(restuple);
4249 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004251 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4252 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4253 Py_DECREF(restuple);
4254 return NULL;
4255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004258 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4260 Py_DECREF(restuple);
4261 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004262 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 Py_INCREF(resunicode);
4264 Py_DECREF(restuple);
4265 return resunicode;
4266}
4267
4268static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004269 Py_ssize_t size,
4270 const char *errors,
4271 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272{
4273 /* output object */
4274 PyObject *res;
4275 /* pointers to the beginning and end+1 of input */
4276 const Py_UNICODE *startp = p;
4277 const Py_UNICODE *endp = p + size;
4278 /* pointer to the beginning of the unencodable characters */
4279 /* const Py_UNICODE *badp = NULL; */
4280 /* pointer into the output */
4281 char *str;
4282 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004283 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004284 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4285 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 PyObject *errorHandler = NULL;
4287 PyObject *exc = NULL;
4288 /* the following variable is used for caching string comparisons
4289 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4290 int known_errorHandler = -1;
4291
4292 /* allocate enough for a simple encoding without
4293 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004294 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004295 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004296 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004298 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004299 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 ressize = size;
4301
4302 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304
Benjamin Peterson29060642009-01-31 22:14:21 +00004305 /* can we encode this? */
4306 if (c<limit) {
4307 /* no overflow check, because we know that the space is enough */
4308 *str++ = (char)c;
4309 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004310 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 else {
4312 Py_ssize_t unicodepos = p-startp;
4313 Py_ssize_t requiredsize;
4314 PyObject *repunicode;
4315 Py_ssize_t repsize;
4316 Py_ssize_t newpos;
4317 Py_ssize_t respos;
4318 Py_UNICODE *uni2;
4319 /* startpos for collecting unencodable chars */
4320 const Py_UNICODE *collstart = p;
4321 const Py_UNICODE *collend = p;
4322 /* find all unecodable characters */
4323 while ((collend < endp) && ((*collend)>=limit))
4324 ++collend;
4325 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4326 if (known_errorHandler==-1) {
4327 if ((errors==NULL) || (!strcmp(errors, "strict")))
4328 known_errorHandler = 1;
4329 else if (!strcmp(errors, "replace"))
4330 known_errorHandler = 2;
4331 else if (!strcmp(errors, "ignore"))
4332 known_errorHandler = 3;
4333 else if (!strcmp(errors, "xmlcharrefreplace"))
4334 known_errorHandler = 4;
4335 else
4336 known_errorHandler = 0;
4337 }
4338 switch (known_errorHandler) {
4339 case 1: /* strict */
4340 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4341 goto onError;
4342 case 2: /* replace */
4343 while (collstart++<collend)
4344 *str++ = '?'; /* fall through */
4345 case 3: /* ignore */
4346 p = collend;
4347 break;
4348 case 4: /* xmlcharrefreplace */
4349 respos = str - PyBytes_AS_STRING(res);
4350 /* determine replacement size (temporarily (mis)uses p) */
4351 for (p = collstart, repsize = 0; p < collend; ++p) {
4352 if (*p<10)
4353 repsize += 2+1+1;
4354 else if (*p<100)
4355 repsize += 2+2+1;
4356 else if (*p<1000)
4357 repsize += 2+3+1;
4358 else if (*p<10000)
4359 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004360#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 else
4362 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004363#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004364 else if (*p<100000)
4365 repsize += 2+5+1;
4366 else if (*p<1000000)
4367 repsize += 2+6+1;
4368 else
4369 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004370#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 }
4372 requiredsize = respos+repsize+(endp-collend);
4373 if (requiredsize > ressize) {
4374 if (requiredsize<2*ressize)
4375 requiredsize = 2*ressize;
4376 if (_PyBytes_Resize(&res, requiredsize))
4377 goto onError;
4378 str = PyBytes_AS_STRING(res) + respos;
4379 ressize = requiredsize;
4380 }
4381 /* generate replacement (temporarily (mis)uses p) */
4382 for (p = collstart; p < collend; ++p) {
4383 str += sprintf(str, "&#%d;", (int)*p);
4384 }
4385 p = collend;
4386 break;
4387 default:
4388 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4389 encoding, reason, startp, size, &exc,
4390 collstart-startp, collend-startp, &newpos);
4391 if (repunicode == NULL)
4392 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004393 if (PyBytes_Check(repunicode)) {
4394 /* Directly copy bytes result to output. */
4395 repsize = PyBytes_Size(repunicode);
4396 if (repsize > 1) {
4397 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004398 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004399 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4400 Py_DECREF(repunicode);
4401 goto onError;
4402 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004403 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004404 ressize += repsize-1;
4405 }
4406 memcpy(str, PyBytes_AsString(repunicode), repsize);
4407 str += repsize;
4408 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004409 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004410 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004411 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 /* need more space? (at least enough for what we
4413 have+the replacement+the rest of the string, so
4414 we won't have to check space for encodable characters) */
4415 respos = str - PyBytes_AS_STRING(res);
4416 repsize = PyUnicode_GET_SIZE(repunicode);
4417 requiredsize = respos+repsize+(endp-collend);
4418 if (requiredsize > ressize) {
4419 if (requiredsize<2*ressize)
4420 requiredsize = 2*ressize;
4421 if (_PyBytes_Resize(&res, requiredsize)) {
4422 Py_DECREF(repunicode);
4423 goto onError;
4424 }
4425 str = PyBytes_AS_STRING(res) + respos;
4426 ressize = requiredsize;
4427 }
4428 /* check if there is anything unencodable in the replacement
4429 and copy it to the output */
4430 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4431 c = *uni2;
4432 if (c >= limit) {
4433 raise_encode_exception(&exc, encoding, startp, size,
4434 unicodepos, unicodepos+1, reason);
4435 Py_DECREF(repunicode);
4436 goto onError;
4437 }
4438 *str = (char)c;
4439 }
4440 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004441 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004442 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004443 }
4444 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004445 /* Resize if we allocated to much */
4446 size = str - PyBytes_AS_STRING(res);
4447 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004448 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004449 if (_PyBytes_Resize(&res, size) < 0)
4450 goto onError;
4451 }
4452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 Py_XDECREF(errorHandler);
4454 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004455 return res;
4456
4457 onError:
4458 Py_XDECREF(res);
4459 Py_XDECREF(errorHandler);
4460 Py_XDECREF(exc);
4461 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462}
4463
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 Py_ssize_t size,
4466 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469}
4470
4471PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4472{
4473 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 PyErr_BadArgument();
4475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
4477 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 PyUnicode_GET_SIZE(unicode),
4479 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480}
4481
4482/* --- 7-bit ASCII Codec -------------------------------------------------- */
4483
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 Py_ssize_t size,
4486 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 PyUnicodeObject *v;
4490 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004491 Py_ssize_t startinpos;
4492 Py_ssize_t endinpos;
4493 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 const char *e;
4495 PyObject *errorHandler = NULL;
4496 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004497
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004499 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 Py_UNICODE r = *(unsigned char*)s;
4501 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004502 }
Tim Petersced69f82003-09-16 20:30:58 +00004503
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 v = _PyUnicode_New(size);
4505 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 e = s + size;
4511 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 register unsigned char c = (unsigned char)*s;
4513 if (c < 128) {
4514 *p++ = c;
4515 ++s;
4516 }
4517 else {
4518 startinpos = s-starts;
4519 endinpos = startinpos + 1;
4520 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4521 if (unicode_decode_call_errorhandler(
4522 errors, &errorHandler,
4523 "ascii", "ordinal not in range(128)",
4524 &starts, &e, &startinpos, &endinpos, &exc, &s,
4525 &v, &outpos, &p))
4526 goto onError;
4527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004529 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4531 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 Py_XDECREF(errorHandler);
4533 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004535
Benjamin Peterson29060642009-01-31 22:14:21 +00004536 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 Py_XDECREF(errorHandler);
4539 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 return NULL;
4541}
4542
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 Py_ssize_t size,
4545 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548}
4549
4550PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4551{
4552 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 PyErr_BadArgument();
4554 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 }
4556 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004557 PyUnicode_GET_SIZE(unicode),
4558 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559}
4560
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004561#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004562
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004563/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004564
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004565#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004566#define NEED_RETRY
4567#endif
4568
4569/* XXX This code is limited to "true" double-byte encodings, as
4570 a) it assumes an incomplete character consists of a single byte, and
4571 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004572 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004573
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must pass IsDBCSLeadByte().  It is then accepted when the
   position reported by CharPrev() for it is the byte itself, or is not a
   lead byte, or lies exactly two bytes earlier.
   NOTE(review): presumably this filters out lead-byte-valued bytes that
   are really the trail byte of the preceding character -- confirm against
   the CharPrev documentation. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return target - before == 2;
}
4584
4585/*
4586 * Decode MBCS string into unicode object. If 'final' is set, converts
4587 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4588 */
4589static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 const char *s, /* MBCS string */
4591 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004592 int final,
4593 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004594{
4595 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004596 Py_ssize_t n;
4597 DWORD usize;
4598 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004599
4600 assert(size >= 0);
4601
Victor Stinner554f3f02010-06-16 23:33:54 +00004602 /* check and handle 'errors' arg */
4603 if (errors==NULL || strcmp(errors, "strict")==0)
4604 flags = MB_ERR_INVALID_CHARS;
4605 else if (strcmp(errors, "ignore")==0)
4606 flags = 0;
4607 else {
4608 PyErr_Format(PyExc_ValueError,
4609 "mbcs encoding does not support errors='%s'",
4610 errors);
4611 return -1;
4612 }
4613
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004614 /* Skip trailing lead-byte unless 'final' is set */
4615 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004617
4618 /* First get the size of the result */
4619 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004620 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4621 if (usize==0)
4622 goto mbcs_decode_error;
4623 } else
4624 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004625
4626 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 /* Create unicode object */
4628 *v = _PyUnicode_New(usize);
4629 if (*v == NULL)
4630 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004631 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004632 }
4633 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004634 /* Extend unicode object */
4635 n = PyUnicode_GET_SIZE(*v);
4636 if (_PyUnicode_Resize(v, n + usize) < 0)
4637 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004638 }
4639
4640 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004641 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004643 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4644 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004646 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004647 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004648
4649mbcs_decode_error:
4650 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4651 we raise a UnicodeDecodeError - else it is a 'generic'
4652 windows error
4653 */
4654 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4655 /* Ideally, we should get reason from FormatMessage - this
4656 is the Windows 2000 English version of the message
4657 */
4658 PyObject *exc = NULL;
4659 const char *reason = "No mapping for the Unicode character exists "
4660 "in the target multi-byte code page.";
4661 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4662 if (exc != NULL) {
4663 PyCodec_StrictErrors(exc);
4664 Py_DECREF(exc);
4665 }
4666 } else {
4667 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4668 }
4669 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004670}
4671
4672PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 Py_ssize_t size,
4674 const char *errors,
4675 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004676{
4677 PyUnicodeObject *v = NULL;
4678 int done;
4679
4680 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004682
4683#ifdef NEED_RETRY
4684 retry:
4685 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004686 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004687 else
4688#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004689 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004690
4691 if (done < 0) {
4692 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004694 }
4695
4696 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004698
4699#ifdef NEED_RETRY
4700 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 s += done;
4702 size -= done;
4703 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004704 }
4705#endif
4706
4707 return (PyObject *)v;
4708}
4709
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004710PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004711 Py_ssize_t size,
4712 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004713{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004714 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4715}
4716
4717/*
4718 * Convert unicode into string object (MBCS).
4719 * Returns 0 if succeed, -1 otherwise.
4720 */
4721static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004723 int size, /* size of unicode */
4724 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004725{
Victor Stinner554f3f02010-06-16 23:33:54 +00004726 BOOL usedDefaultChar = FALSE;
4727 BOOL *pusedDefaultChar;
4728 int mbcssize;
4729 Py_ssize_t n;
4730 PyObject *exc = NULL;
4731 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004732
4733 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004734
Victor Stinner554f3f02010-06-16 23:33:54 +00004735 /* check and handle 'errors' arg */
4736 if (errors==NULL || strcmp(errors, "strict")==0) {
4737 flags = WC_NO_BEST_FIT_CHARS;
4738 pusedDefaultChar = &usedDefaultChar;
4739 } else if (strcmp(errors, "replace")==0) {
4740 flags = 0;
4741 pusedDefaultChar = NULL;
4742 } else {
4743 PyErr_Format(PyExc_ValueError,
4744 "mbcs encoding does not support errors='%s'",
4745 errors);
4746 return -1;
4747 }
4748
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004749 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004750 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004751 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4752 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004753 if (mbcssize == 0) {
4754 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4755 return -1;
4756 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004757 /* If we used a default char, then we failed! */
4758 if (pusedDefaultChar && *pusedDefaultChar)
4759 goto mbcs_encode_error;
4760 } else {
4761 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004762 }
4763
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004764 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 /* Create string object */
4766 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4767 if (*repr == NULL)
4768 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004769 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004770 }
4771 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004772 /* Extend string object */
4773 n = PyBytes_Size(*repr);
4774 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4775 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004776 }
4777
4778 /* Do the conversion */
4779 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004781 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4782 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004783 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4784 return -1;
4785 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004786 if (pusedDefaultChar && *pusedDefaultChar)
4787 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004788 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004789 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004790
4791mbcs_encode_error:
4792 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4793 Py_XDECREF(exc);
4794 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004795}
4796
4797PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 Py_ssize_t size,
4799 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004800{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004801 PyObject *repr = NULL;
4802 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004803
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004804#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004805 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004806 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004807 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004808 else
4809#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004810 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004811
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004812 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 Py_XDECREF(repr);
4814 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004815 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004816
4817#ifdef NEED_RETRY
4818 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 p += INT_MAX;
4820 size -= INT_MAX;
4821 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004822 }
4823#endif
4824
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004825 return repr;
4826}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004827
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004828PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4829{
4830 if (!PyUnicode_Check(unicode)) {
4831 PyErr_BadArgument();
4832 return NULL;
4833 }
4834 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 PyUnicode_GET_SIZE(unicode),
4836 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004837}
4838
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004839#undef NEED_RETRY
4840
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004841#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004842
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843/* --- Character Mapping Codec -------------------------------------------- */
4844
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 Py_ssize_t size,
4847 PyObject *mapping,
4848 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004851 Py_ssize_t startinpos;
4852 Py_ssize_t endinpos;
4853 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004854 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 PyUnicodeObject *v;
4856 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004857 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 PyObject *errorHandler = NULL;
4859 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004860 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004861 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004862
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 /* Default to Latin-1 */
4864 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004865 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866
4867 v = _PyUnicode_New(size);
4868 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004869 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004871 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004874 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 mapstring = PyUnicode_AS_UNICODE(mapping);
4876 maplen = PyUnicode_GET_SIZE(mapping);
4877 while (s < e) {
4878 unsigned char ch = *s;
4879 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 if (ch < maplen)
4882 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 if (x == 0xfffe) {
4885 /* undefined mapping */
4886 outpos = p-PyUnicode_AS_UNICODE(v);
4887 startinpos = s-starts;
4888 endinpos = startinpos+1;
4889 if (unicode_decode_call_errorhandler(
4890 errors, &errorHandler,
4891 "charmap", "character maps to <undefined>",
4892 &starts, &e, &startinpos, &endinpos, &exc, &s,
4893 &v, &outpos, &p)) {
4894 goto onError;
4895 }
4896 continue;
4897 }
4898 *p++ = x;
4899 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004900 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004901 }
4902 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 while (s < e) {
4904 unsigned char ch = *s;
4905 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004906
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4908 w = PyLong_FromLong((long)ch);
4909 if (w == NULL)
4910 goto onError;
4911 x = PyObject_GetItem(mapping, w);
4912 Py_DECREF(w);
4913 if (x == NULL) {
4914 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4915 /* No mapping found means: mapping is undefined. */
4916 PyErr_Clear();
4917 x = Py_None;
4918 Py_INCREF(x);
4919 } else
4920 goto onError;
4921 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004922
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 /* Apply mapping */
4924 if (PyLong_Check(x)) {
4925 long value = PyLong_AS_LONG(x);
4926 if (value < 0 || value > 65535) {
4927 PyErr_SetString(PyExc_TypeError,
4928 "character mapping must be in range(65536)");
4929 Py_DECREF(x);
4930 goto onError;
4931 }
4932 *p++ = (Py_UNICODE)value;
4933 }
4934 else if (x == Py_None) {
4935 /* undefined mapping */
4936 outpos = p-PyUnicode_AS_UNICODE(v);
4937 startinpos = s-starts;
4938 endinpos = startinpos+1;
4939 if (unicode_decode_call_errorhandler(
4940 errors, &errorHandler,
4941 "charmap", "character maps to <undefined>",
4942 &starts, &e, &startinpos, &endinpos, &exc, &s,
4943 &v, &outpos, &p)) {
4944 Py_DECREF(x);
4945 goto onError;
4946 }
4947 Py_DECREF(x);
4948 continue;
4949 }
4950 else if (PyUnicode_Check(x)) {
4951 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004952
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 if (targetsize == 1)
4954 /* 1-1 mapping */
4955 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004956
Benjamin Peterson29060642009-01-31 22:14:21 +00004957 else if (targetsize > 1) {
4958 /* 1-n mapping */
4959 if (targetsize > extrachars) {
4960 /* resize first */
4961 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4962 Py_ssize_t needed = (targetsize - extrachars) + \
4963 (targetsize << 2);
4964 extrachars += needed;
4965 /* XXX overflow detection missing */
4966 if (_PyUnicode_Resize(&v,
4967 PyUnicode_GET_SIZE(v) + needed) < 0) {
4968 Py_DECREF(x);
4969 goto onError;
4970 }
4971 p = PyUnicode_AS_UNICODE(v) + oldpos;
4972 }
4973 Py_UNICODE_COPY(p,
4974 PyUnicode_AS_UNICODE(x),
4975 targetsize);
4976 p += targetsize;
4977 extrachars -= targetsize;
4978 }
4979 /* 1-0 mapping: skip the character */
4980 }
4981 else {
4982 /* wrong return value */
4983 PyErr_SetString(PyExc_TypeError,
4984 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004985 Py_DECREF(x);
4986 goto onError;
4987 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 Py_DECREF(x);
4989 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 }
4992 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4994 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995 Py_XDECREF(errorHandler);
4996 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004998
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000 Py_XDECREF(errorHandler);
5001 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 Py_XDECREF(v);
5003 return NULL;
5004}
5005
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005006/* Charmap encoding: the lookup table */
5007
5008struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 PyObject_HEAD
5010 unsigned char level1[32];
5011 int count2, count3;
5012 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005013};
5014
5015static PyObject*
5016encoding_map_size(PyObject *obj, PyObject* args)
5017{
5018 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005019 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005021}
5022
5023static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005024 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 PyDoc_STR("Return the size (in bytes) of this object") },
5026 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005027};
5028
5029static void
5030encoding_map_dealloc(PyObject* o)
5031{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005032 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005033}
5034
5035static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005036 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 "EncodingMap", /*tp_name*/
5038 sizeof(struct encoding_map), /*tp_basicsize*/
5039 0, /*tp_itemsize*/
5040 /* methods */
5041 encoding_map_dealloc, /*tp_dealloc*/
5042 0, /*tp_print*/
5043 0, /*tp_getattr*/
5044 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005045 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 0, /*tp_repr*/
5047 0, /*tp_as_number*/
5048 0, /*tp_as_sequence*/
5049 0, /*tp_as_mapping*/
5050 0, /*tp_hash*/
5051 0, /*tp_call*/
5052 0, /*tp_str*/
5053 0, /*tp_getattro*/
5054 0, /*tp_setattro*/
5055 0, /*tp_as_buffer*/
5056 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5057 0, /*tp_doc*/
5058 0, /*tp_traverse*/
5059 0, /*tp_clear*/
5060 0, /*tp_richcompare*/
5061 0, /*tp_weaklistoffset*/
5062 0, /*tp_iter*/
5063 0, /*tp_iternext*/
5064 encoding_map_methods, /*tp_methods*/
5065 0, /*tp_members*/
5066 0, /*tp_getset*/
5067 0, /*tp_base*/
5068 0, /*tp_dict*/
5069 0, /*tp_descr_get*/
5070 0, /*tp_descr_set*/
5071 0, /*tp_dictoffset*/
5072 0, /*tp_init*/
5073 0, /*tp_alloc*/
5074 0, /*tp_new*/
5075 0, /*tp_free*/
5076 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005077};
5078
5079PyObject*
5080PyUnicode_BuildEncodingMap(PyObject* string)
5081{
5082 Py_UNICODE *decode;
5083 PyObject *result;
5084 struct encoding_map *mresult;
5085 int i;
5086 int need_dict = 0;
5087 unsigned char level1[32];
5088 unsigned char level2[512];
5089 unsigned char *mlevel1, *mlevel2, *mlevel3;
5090 int count2 = 0, count3 = 0;
5091
5092 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5093 PyErr_BadArgument();
5094 return NULL;
5095 }
5096 decode = PyUnicode_AS_UNICODE(string);
5097 memset(level1, 0xFF, sizeof level1);
5098 memset(level2, 0xFF, sizeof level2);
5099
5100 /* If there isn't a one-to-one mapping of NULL to \0,
5101 or if there are non-BMP characters, we need to use
5102 a mapping dictionary. */
5103 if (decode[0] != 0)
5104 need_dict = 1;
5105 for (i = 1; i < 256; i++) {
5106 int l1, l2;
5107 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005108#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005109 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005110#endif
5111 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005112 need_dict = 1;
5113 break;
5114 }
5115 if (decode[i] == 0xFFFE)
5116 /* unmapped character */
5117 continue;
5118 l1 = decode[i] >> 11;
5119 l2 = decode[i] >> 7;
5120 if (level1[l1] == 0xFF)
5121 level1[l1] = count2++;
5122 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005123 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005124 }
5125
5126 if (count2 >= 0xFF || count3 >= 0xFF)
5127 need_dict = 1;
5128
5129 if (need_dict) {
5130 PyObject *result = PyDict_New();
5131 PyObject *key, *value;
5132 if (!result)
5133 return NULL;
5134 for (i = 0; i < 256; i++) {
5135 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005136 key = PyLong_FromLong(decode[i]);
5137 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005138 if (!key || !value)
5139 goto failed1;
5140 if (PyDict_SetItem(result, key, value) == -1)
5141 goto failed1;
5142 Py_DECREF(key);
5143 Py_DECREF(value);
5144 }
5145 return result;
5146 failed1:
5147 Py_XDECREF(key);
5148 Py_XDECREF(value);
5149 Py_DECREF(result);
5150 return NULL;
5151 }
5152
5153 /* Create a three-level trie */
5154 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5155 16*count2 + 128*count3 - 1);
5156 if (!result)
5157 return PyErr_NoMemory();
5158 PyObject_Init(result, &EncodingMapType);
5159 mresult = (struct encoding_map*)result;
5160 mresult->count2 = count2;
5161 mresult->count3 = count3;
5162 mlevel1 = mresult->level1;
5163 mlevel2 = mresult->level23;
5164 mlevel3 = mresult->level23 + 16*count2;
5165 memcpy(mlevel1, level1, 32);
5166 memset(mlevel2, 0xFF, 16*count2);
5167 memset(mlevel3, 0, 128*count3);
5168 count3 = 0;
5169 for (i = 1; i < 256; i++) {
5170 int o1, o2, o3, i2, i3;
5171 if (decode[i] == 0xFFFE)
5172 /* unmapped character */
5173 continue;
5174 o1 = decode[i]>>11;
5175 o2 = (decode[i]>>7) & 0xF;
5176 i2 = 16*mlevel1[o1] + o2;
5177 if (mlevel2[i2] == 0xFF)
5178 mlevel2[i2] = count3++;
5179 o3 = decode[i] & 0x7F;
5180 i3 = 128*mlevel2[i2] + o3;
5181 mlevel3[i3] = i;
5182 }
5183 return result;
5184}
5185
5186static int
5187encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5188{
5189 struct encoding_map *map = (struct encoding_map*)mapping;
5190 int l1 = c>>11;
5191 int l2 = (c>>7) & 0xF;
5192 int l3 = c & 0x7F;
5193 int i;
5194
5195#ifdef Py_UNICODE_WIDE
5196 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005198 }
5199#endif
5200 if (c == 0)
5201 return 0;
5202 /* level 1*/
5203 i = map->level1[l1];
5204 if (i == 0xFF) {
5205 return -1;
5206 }
5207 /* level 2*/
5208 i = map->level23[16*i+l2];
5209 if (i == 0xFF) {
5210 return -1;
5211 }
5212 /* level 3 */
5213 i = map->level23[16*map->count2 + 128*i + l3];
5214 if (i == 0) {
5215 return -1;
5216 }
5217 return i;
5218}
5219
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005220/* Lookup the character ch in the mapping. If the character
5221 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005222 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005223static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224{
Christian Heimes217cfd12007-12-02 14:31:20 +00005225 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226 PyObject *x;
5227
5228 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005230 x = PyObject_GetItem(mapping, w);
5231 Py_DECREF(w);
5232 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5234 /* No mapping found means: mapping is undefined. */
5235 PyErr_Clear();
5236 x = Py_None;
5237 Py_INCREF(x);
5238 return x;
5239 } else
5240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005242 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005244 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 long value = PyLong_AS_LONG(x);
5246 if (value < 0 || value > 255) {
5247 PyErr_SetString(PyExc_TypeError,
5248 "character mapping must be in range(256)");
5249 Py_DECREF(x);
5250 return NULL;
5251 }
5252 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005254 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 /* wrong return value */
5258 PyErr_Format(PyExc_TypeError,
5259 "character mapping must return integer, bytes or None, not %.400s",
5260 x->ob_type->tp_name);
5261 Py_DECREF(x);
5262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 }
5264}
5265
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005266static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005267charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005268{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005269 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5270 /* exponentially overallocate to minimize reallocations */
5271 if (requiredsize < 2*outsize)
5272 requiredsize = 2*outsize;
5273 if (_PyBytes_Resize(outobj, requiredsize))
5274 return -1;
5275 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005276}
5277
/* Result codes returned by charmapencode_output():
   enc_SUCCESS   - the character was encoded and written to the output
   enc_FAILED    - the mapping has no entry for the character; nothing
                   was written
   enc_EXCEPTION - the lookup or the output resize failed */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005281/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005282 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283 space is available. Return a new reference to the object that
5284 was put in the output buffer, or Py_None, if the mapping was undefined
5285 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005286 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005287static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005288charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005290{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005291 PyObject *rep;
5292 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005293 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005294
Christian Heimes90aa7642007-12-19 02:45:37 +00005295 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005296 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005298 if (res == -1)
5299 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 if (outsize<requiredsize)
5301 if (charmapencode_resize(outobj, outpos, requiredsize))
5302 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005303 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005304 outstart[(*outpos)++] = (char)res;
5305 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005306 }
5307
5308 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005309 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005311 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 Py_DECREF(rep);
5313 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005314 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005315 if (PyLong_Check(rep)) {
5316 Py_ssize_t requiredsize = *outpos+1;
5317 if (outsize<requiredsize)
5318 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5319 Py_DECREF(rep);
5320 return enc_EXCEPTION;
5321 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005322 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005324 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 else {
5326 const char *repchars = PyBytes_AS_STRING(rep);
5327 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5328 Py_ssize_t requiredsize = *outpos+repsize;
5329 if (outsize<requiredsize)
5330 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5331 Py_DECREF(rep);
5332 return enc_EXCEPTION;
5333 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005334 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 memcpy(outstart + *outpos, repchars, repsize);
5336 *outpos += repsize;
5337 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005339 Py_DECREF(rep);
5340 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005341}
5342
5343/* handle an error in PyUnicode_EncodeCharmap
5344 Return 0 on success, -1 on error */
5345static
5346int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005347 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005348 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005349 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005350 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005351{
5352 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005353 Py_ssize_t repsize;
5354 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 Py_UNICODE *uni2;
5356 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005357 Py_ssize_t collstartpos = *inpos;
5358 Py_ssize_t collendpos = *inpos+1;
5359 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 char *encoding = "charmap";
5361 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005362 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005364 /* find all unencodable characters */
5365 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005366 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005367 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 int res = encoding_map_lookup(p[collendpos], mapping);
5369 if (res != -1)
5370 break;
5371 ++collendpos;
5372 continue;
5373 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005374
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 rep = charmapencode_lookup(p[collendpos], mapping);
5376 if (rep==NULL)
5377 return -1;
5378 else if (rep!=Py_None) {
5379 Py_DECREF(rep);
5380 break;
5381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005382 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 }
5385 /* cache callback name lookup
5386 * (if not done yet, i.e. it's the first error) */
5387 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 if ((errors==NULL) || (!strcmp(errors, "strict")))
5389 *known_errorHandler = 1;
5390 else if (!strcmp(errors, "replace"))
5391 *known_errorHandler = 2;
5392 else if (!strcmp(errors, "ignore"))
5393 *known_errorHandler = 3;
5394 else if (!strcmp(errors, "xmlcharrefreplace"))
5395 *known_errorHandler = 4;
5396 else
5397 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398 }
5399 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005400 case 1: /* strict */
5401 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5402 return -1;
5403 case 2: /* replace */
5404 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 x = charmapencode_output('?', mapping, res, respos);
5406 if (x==enc_EXCEPTION) {
5407 return -1;
5408 }
5409 else if (x==enc_FAILED) {
5410 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5411 return -1;
5412 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005413 }
5414 /* fall through */
5415 case 3: /* ignore */
5416 *inpos = collendpos;
5417 break;
5418 case 4: /* xmlcharrefreplace */
5419 /* generate replacement (temporarily (mis)uses p) */
5420 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 char buffer[2+29+1+1];
5422 char *cp;
5423 sprintf(buffer, "&#%d;", (int)p[collpos]);
5424 for (cp = buffer; *cp; ++cp) {
5425 x = charmapencode_output(*cp, mapping, res, respos);
5426 if (x==enc_EXCEPTION)
5427 return -1;
5428 else if (x==enc_FAILED) {
5429 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5430 return -1;
5431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005432 }
5433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005434 *inpos = collendpos;
5435 break;
5436 default:
5437 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 encoding, reason, p, size, exceptionObject,
5439 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005440 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005442 if (PyBytes_Check(repunicode)) {
5443 /* Directly copy bytes result to output. */
5444 Py_ssize_t outsize = PyBytes_Size(*res);
5445 Py_ssize_t requiredsize;
5446 repsize = PyBytes_Size(repunicode);
5447 requiredsize = *respos + repsize;
5448 if (requiredsize > outsize)
5449 /* Make room for all additional bytes. */
5450 if (charmapencode_resize(res, respos, requiredsize)) {
5451 Py_DECREF(repunicode);
5452 return -1;
5453 }
5454 memcpy(PyBytes_AsString(*res) + *respos,
5455 PyBytes_AsString(repunicode), repsize);
5456 *respos += repsize;
5457 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005458 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005459 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005460 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005461 /* generate replacement */
5462 repsize = PyUnicode_GET_SIZE(repunicode);
5463 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 x = charmapencode_output(*uni2, mapping, res, respos);
5465 if (x==enc_EXCEPTION) {
5466 return -1;
5467 }
5468 else if (x==enc_FAILED) {
5469 Py_DECREF(repunicode);
5470 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5471 return -1;
5472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005473 }
5474 *inpos = newpos;
5475 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 }
5477 return 0;
5478}
5479
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 Py_ssize_t size,
5482 PyObject *mapping,
5483 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005485 /* output object */
5486 PyObject *res = NULL;
5487 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005488 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005489 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005490 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005491 PyObject *errorHandler = NULL;
5492 PyObject *exc = NULL;
5493 /* the following variable is used for caching string comparisons
5494 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5495 * 3=ignore, 4=xmlcharrefreplace */
5496 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497
5498 /* Default to Latin-1 */
5499 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 /* allocate enough for a simple encoding without
5503 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005504 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 if (res == NULL)
5506 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005507 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 /* try to encode it */
5512 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5513 if (x==enc_EXCEPTION) /* error */
5514 goto onError;
5515 if (x==enc_FAILED) { /* unencodable character */
5516 if (charmap_encoding_error(p, size, &inpos, mapping,
5517 &exc,
5518 &known_errorHandler, &errorHandler, errors,
5519 &res, &respos)) {
5520 goto onError;
5521 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005522 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 else
5524 /* done with this character => adjust input position */
5525 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005528 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005529 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005530 if (_PyBytes_Resize(&res, respos) < 0)
5531 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005533 Py_XDECREF(exc);
5534 Py_XDECREF(errorHandler);
5535 return res;
5536
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538 Py_XDECREF(res);
5539 Py_XDECREF(exc);
5540 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 return NULL;
5542}
5543
5544PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546{
5547 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 PyErr_BadArgument();
5549 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 }
5551 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 PyUnicode_GET_SIZE(unicode),
5553 mapping,
5554 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555}
5556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005557/* create or adjust a UnicodeTranslateError */
5558static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 const Py_UNICODE *unicode, Py_ssize_t size,
5560 Py_ssize_t startpos, Py_ssize_t endpos,
5561 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005564 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 }
5567 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5569 goto onError;
5570 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5571 goto onError;
5572 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5573 goto onError;
5574 return;
5575 onError:
5576 Py_DECREF(*exceptionObject);
5577 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 }
5579}
5580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005581/* raises a UnicodeTranslateError */
5582static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 const Py_UNICODE *unicode, Py_ssize_t size,
5584 Py_ssize_t startpos, Py_ssize_t endpos,
5585 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005586{
5587 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005589 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591}
5592
5593/* error handling callback helper:
5594 build arguments, call the callback and check the arguments,
5595 put the result into newpos and return the replacement string, which
5596 has to be freed by the caller */
5597static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 PyObject **errorHandler,
5599 const char *reason,
5600 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5601 Py_ssize_t startpos, Py_ssize_t endpos,
5602 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005604 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005605
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005606 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 PyObject *restuple;
5608 PyObject *resunicode;
5609
5610 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005612 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614 }
5615
5616 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620
5621 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005626 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 Py_DECREF(restuple);
5628 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 }
5630 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 &resunicode, &i_newpos)) {
5632 Py_DECREF(restuple);
5633 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005635 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005637 else
5638 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005639 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5641 Py_DECREF(restuple);
5642 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005643 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005644 Py_INCREF(resunicode);
5645 Py_DECREF(restuple);
5646 return resunicode;
5647}
5648
5649/* Lookup the character ch in the mapping and put the result in result,
5650 which must be decrefed by the caller.
5651 Return 0 on success, -1 on error */
5652static
5653int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5654{
Christian Heimes217cfd12007-12-02 14:31:20 +00005655 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 PyObject *x;
5657
5658 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005660 x = PyObject_GetItem(mapping, w);
5661 Py_DECREF(w);
5662 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5664 /* No mapping found means: use 1:1 mapping. */
5665 PyErr_Clear();
5666 *result = NULL;
5667 return 0;
5668 } else
5669 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005670 }
5671 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 *result = x;
5673 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005675 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 long value = PyLong_AS_LONG(x);
5677 long max = PyUnicode_GetMax();
5678 if (value < 0 || value > max) {
5679 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005680 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 Py_DECREF(x);
5682 return -1;
5683 }
5684 *result = x;
5685 return 0;
5686 }
5687 else if (PyUnicode_Check(x)) {
5688 *result = x;
5689 return 0;
5690 }
5691 else {
5692 /* wrong return value */
5693 PyErr_SetString(PyExc_TypeError,
5694 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005695 Py_DECREF(x);
5696 return -1;
5697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698}
5699/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 if not reallocate and adjust various state variables.
5701 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702static
Walter Dörwald4894c302003-10-24 14:25:28 +00005703int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005706 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005707 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 /* remember old output position */
5709 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5710 /* exponentially overallocate to minimize reallocations */
5711 if (requiredsize < 2 * oldsize)
5712 requiredsize = 2 * oldsize;
5713 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5714 return -1;
5715 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716 }
5717 return 0;
5718}
5719/* lookup the character, put the result in the output string and adjust
5720 various state variables. Return a new reference to the object that
5721 was put in the output buffer in *result, or Py_None, if the mapping was
5722 undefined (in which case no character was written).
5723 The called must decref result.
5724 Return 0 on success, -1 on error. */
5725static
Walter Dörwald4894c302003-10-24 14:25:28 +00005726int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5728 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729{
Walter Dörwald4894c302003-10-24 14:25:28 +00005730 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 /* not found => default to 1:1 mapping */
5734 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 }
5736 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005738 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 /* no overflow check, because we know that the space is enough */
5740 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 }
5742 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5744 if (repsize==1) {
5745 /* no overflow check, because we know that the space is enough */
5746 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5747 }
5748 else if (repsize!=0) {
5749 /* more than one character */
5750 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5751 (insize - (curinp-startinp)) +
5752 repsize - 1;
5753 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5754 return -1;
5755 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5756 *outp += repsize;
5757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 }
5759 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 return 0;
5762}
5763
5764PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 Py_ssize_t size,
5766 PyObject *mapping,
5767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 /* output object */
5770 PyObject *res = NULL;
5771 /* pointers to the beginning and end+1 of input */
5772 const Py_UNICODE *startp = p;
5773 const Py_UNICODE *endp = p + size;
5774 /* pointer into the output */
5775 Py_UNICODE *str;
5776 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005777 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 char *reason = "character maps to <undefined>";
5779 PyObject *errorHandler = NULL;
5780 PyObject *exc = NULL;
5781 /* the following variable is used for caching string comparisons
5782 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5783 * 3=ignore, 4=xmlcharrefreplace */
5784 int known_errorHandler = -1;
5785
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 PyErr_BadArgument();
5788 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790
5791 /* allocate enough for a simple 1:1 translation without
5792 replacements, if we need more, we'll resize */
5793 res = PyUnicode_FromUnicode(NULL, size);
5794 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 /* try to encode it */
5802 PyObject *x = NULL;
5803 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5804 Py_XDECREF(x);
5805 goto onError;
5806 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005807 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 if (x!=Py_None) /* it worked => adjust input pointer */
5809 ++p;
5810 else { /* untranslatable character */
5811 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5812 Py_ssize_t repsize;
5813 Py_ssize_t newpos;
5814 Py_UNICODE *uni2;
5815 /* startpos for collecting untranslatable chars */
5816 const Py_UNICODE *collstart = p;
5817 const Py_UNICODE *collend = p+1;
5818 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 /* find all untranslatable characters */
5821 while (collend < endp) {
5822 if (charmaptranslate_lookup(*collend, mapping, &x))
5823 goto onError;
5824 Py_XDECREF(x);
5825 if (x!=Py_None)
5826 break;
5827 ++collend;
5828 }
5829 /* cache callback name lookup
5830 * (if not done yet, i.e. it's the first error) */
5831 if (known_errorHandler==-1) {
5832 if ((errors==NULL) || (!strcmp(errors, "strict")))
5833 known_errorHandler = 1;
5834 else if (!strcmp(errors, "replace"))
5835 known_errorHandler = 2;
5836 else if (!strcmp(errors, "ignore"))
5837 known_errorHandler = 3;
5838 else if (!strcmp(errors, "xmlcharrefreplace"))
5839 known_errorHandler = 4;
5840 else
5841 known_errorHandler = 0;
5842 }
5843 switch (known_errorHandler) {
5844 case 1: /* strict */
5845 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005846 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 case 2: /* replace */
5848 /* No need to check for space, this is a 1:1 replacement */
5849 for (coll = collstart; coll<collend; ++coll)
5850 *str++ = '?';
5851 /* fall through */
5852 case 3: /* ignore */
5853 p = collend;
5854 break;
5855 case 4: /* xmlcharrefreplace */
5856 /* generate replacement (temporarily (mis)uses p) */
5857 for (p = collstart; p < collend; ++p) {
5858 char buffer[2+29+1+1];
5859 char *cp;
5860 sprintf(buffer, "&#%d;", (int)*p);
5861 if (charmaptranslate_makespace(&res, &str,
5862 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5863 goto onError;
5864 for (cp = buffer; *cp; ++cp)
5865 *str++ = *cp;
5866 }
5867 p = collend;
5868 break;
5869 default:
5870 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5871 reason, startp, size, &exc,
5872 collstart-startp, collend-startp, &newpos);
5873 if (repunicode == NULL)
5874 goto onError;
5875 /* generate replacement */
5876 repsize = PyUnicode_GET_SIZE(repunicode);
5877 if (charmaptranslate_makespace(&res, &str,
5878 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5879 Py_DECREF(repunicode);
5880 goto onError;
5881 }
5882 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5883 *str++ = *uni2;
5884 p = startp + newpos;
5885 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005886 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005887 }
5888 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 /* Resize if we allocated to much */
5890 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005891 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 if (PyUnicode_Resize(&res, respos) < 0)
5893 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894 }
5895 Py_XDECREF(exc);
5896 Py_XDECREF(errorHandler);
5897 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005900 Py_XDECREF(res);
5901 Py_XDECREF(exc);
5902 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 return NULL;
5904}
5905
5906PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 PyObject *mapping,
5908 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909{
5910 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005911
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 str = PyUnicode_FromObject(str);
5913 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 PyUnicode_GET_SIZE(str),
5917 mapping,
5918 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 Py_DECREF(str);
5920 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005921
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 Py_XDECREF(str);
5924 return NULL;
5925}
Tim Petersced69f82003-09-16 20:30:58 +00005926
Guido van Rossum9e896b32000-04-05 20:11:21 +00005927/* --- Decimal Encoder ---------------------------------------------------- */
5928
5929int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 Py_ssize_t length,
5931 char *output,
5932 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005933{
5934 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 PyObject *errorHandler = NULL;
5936 PyObject *exc = NULL;
5937 const char *encoding = "decimal";
5938 const char *reason = "invalid decimal Unicode string";
5939 /* the following variable is used for caching string comparisons
5940 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5941 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005942
5943 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 PyErr_BadArgument();
5945 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005946 }
5947
5948 p = s;
5949 end = s + length;
5950 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 register Py_UNICODE ch = *p;
5952 int decimal;
5953 PyObject *repunicode;
5954 Py_ssize_t repsize;
5955 Py_ssize_t newpos;
5956 Py_UNICODE *uni2;
5957 Py_UNICODE *collstart;
5958 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005959
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005961 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 ++p;
5963 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005964 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005965 decimal = Py_UNICODE_TODECIMAL(ch);
5966 if (decimal >= 0) {
5967 *output++ = '0' + decimal;
5968 ++p;
5969 continue;
5970 }
5971 if (0 < ch && ch < 256) {
5972 *output++ = (char)ch;
5973 ++p;
5974 continue;
5975 }
5976 /* All other characters are considered unencodable */
5977 collstart = p;
5978 collend = p+1;
5979 while (collend < end) {
5980 if ((0 < *collend && *collend < 256) ||
5981 !Py_UNICODE_ISSPACE(*collend) ||
5982 Py_UNICODE_TODECIMAL(*collend))
5983 break;
5984 }
5985 /* cache callback name lookup
5986 * (if not done yet, i.e. it's the first error) */
5987 if (known_errorHandler==-1) {
5988 if ((errors==NULL) || (!strcmp(errors, "strict")))
5989 known_errorHandler = 1;
5990 else if (!strcmp(errors, "replace"))
5991 known_errorHandler = 2;
5992 else if (!strcmp(errors, "ignore"))
5993 known_errorHandler = 3;
5994 else if (!strcmp(errors, "xmlcharrefreplace"))
5995 known_errorHandler = 4;
5996 else
5997 known_errorHandler = 0;
5998 }
5999 switch (known_errorHandler) {
6000 case 1: /* strict */
6001 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6002 goto onError;
6003 case 2: /* replace */
6004 for (p = collstart; p < collend; ++p)
6005 *output++ = '?';
6006 /* fall through */
6007 case 3: /* ignore */
6008 p = collend;
6009 break;
6010 case 4: /* xmlcharrefreplace */
6011 /* generate replacement (temporarily (mis)uses p) */
6012 for (p = collstart; p < collend; ++p)
6013 output += sprintf(output, "&#%d;", (int)*p);
6014 p = collend;
6015 break;
6016 default:
6017 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6018 encoding, reason, s, length, &exc,
6019 collstart-s, collend-s, &newpos);
6020 if (repunicode == NULL)
6021 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006022 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006023 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006024 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6025 Py_DECREF(repunicode);
6026 goto onError;
6027 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 /* generate replacement */
6029 repsize = PyUnicode_GET_SIZE(repunicode);
6030 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6031 Py_UNICODE ch = *uni2;
6032 if (Py_UNICODE_ISSPACE(ch))
6033 *output++ = ' ';
6034 else {
6035 decimal = Py_UNICODE_TODECIMAL(ch);
6036 if (decimal >= 0)
6037 *output++ = '0' + decimal;
6038 else if (0 < ch && ch < 256)
6039 *output++ = (char)ch;
6040 else {
6041 Py_DECREF(repunicode);
6042 raise_encode_exception(&exc, encoding,
6043 s, length, collstart-s, collend-s, reason);
6044 goto onError;
6045 }
6046 }
6047 }
6048 p = s + newpos;
6049 Py_DECREF(repunicode);
6050 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006051 }
6052 /* 0-terminate the output string */
6053 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054 Py_XDECREF(exc);
6055 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006056 return 0;
6057
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 Py_XDECREF(exc);
6060 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006061 return -1;
6062}
6063
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064/* --- Helpers ------------------------------------------------------------ */
6065
Eric Smith8c663262007-08-25 02:26:07 +00006066#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006067#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006068
Thomas Wouters477c8d52006-05-27 19:21:47 +00006069#include "stringlib/count.h"
6070#include "stringlib/find.h"
6071#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006072#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006073
Eric Smith5807c412008-05-11 21:00:57 +00006074#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006075#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006076#include "stringlib/localeutil.h"
6077
/* helper macro to fixup start/end slice values
 *
 * `end` is clamped to `len`; a negative `end` or `start` has `len` added
 * to it and is then clamped at 0.  `start` and `end` are modified in
 * place and therefore have to be lvalues; every argument is evaluated
 * more than once, so none of them may have side effects.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and can be used safely in an unbraced if/else.
 * Invoke it as a statement, followed by a semicolon.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006092
Martin v. Löwis18e16552006-02-15 17:27:45 +00006093Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006094 PyObject *substr,
6095 Py_ssize_t start,
6096 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006098 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006099 PyUnicodeObject* str_obj;
6100 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006101
Thomas Wouters477c8d52006-05-27 19:21:47 +00006102 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6103 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006105 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6106 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 Py_DECREF(str_obj);
6108 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 }
Tim Petersced69f82003-09-16 20:30:58 +00006110
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006111 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006112 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006113 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6114 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006115 );
6116
6117 Py_DECREF(sub_obj);
6118 Py_DECREF(str_obj);
6119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 return result;
6121}
6122
Martin v. Löwis18e16552006-02-15 17:27:45 +00006123Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006124 PyObject *sub,
6125 Py_ssize_t start,
6126 Py_ssize_t end,
6127 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006130
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006132 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006134 sub = PyUnicode_FromObject(sub);
6135 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 Py_DECREF(str);
6137 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 }
Tim Petersced69f82003-09-16 20:30:58 +00006139
Thomas Wouters477c8d52006-05-27 19:21:47 +00006140 if (direction > 0)
6141 result = stringlib_find_slice(
6142 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6143 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6144 start, end
6145 );
6146 else
6147 result = stringlib_rfind_slice(
6148 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6149 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6150 start, end
6151 );
6152
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006154 Py_DECREF(sub);
6155
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 return result;
6157}
6158
Tim Petersced69f82003-09-16 20:30:58 +00006159static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 PyUnicodeObject *substring,
6162 Py_ssize_t start,
6163 Py_ssize_t end,
6164 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 if (substring->length == 0)
6167 return 1;
6168
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006169 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 end -= substring->length;
6171 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173
6174 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 if (Py_UNICODE_MATCH(self, end, substring))
6176 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 } else {
6178 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 }
6181
6182 return 0;
6183}
6184
Martin v. Löwis18e16552006-02-15 17:27:45 +00006185Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 PyObject *substr,
6187 Py_ssize_t start,
6188 Py_ssize_t end,
6189 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006191 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006192
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 str = PyUnicode_FromObject(str);
6194 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 substr = PyUnicode_FromObject(substr);
6197 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 Py_DECREF(str);
6199 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 }
Tim Petersced69f82003-09-16 20:30:58 +00006201
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 (PyUnicodeObject *)substr,
6204 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 Py_DECREF(str);
6206 Py_DECREF(substr);
6207 return result;
6208}
6209
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210/* Apply fixfct filter to the Unicode object self and return a
6211 reference to the modified object */
6212
Tim Petersced69f82003-09-16 20:30:58 +00006213static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216{
6217
6218 PyUnicodeObject *u;
6219
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006220 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006223
6224 Py_UNICODE_COPY(u->str, self->str, self->length);
6225
Tim Peters7a29bd52001-09-12 03:03:31 +00006226 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 /* fixfct should return TRUE if it modified the buffer. If
6228 FALSE, return a reference to the original buffer instead
6229 (to save space, not time) */
6230 Py_INCREF(self);
6231 Py_DECREF(u);
6232 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 }
6234 return (PyObject*) u;
6235}
6236
Tim Petersced69f82003-09-16 20:30:58 +00006237static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238int fixupper(PyUnicodeObject *self)
6239{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006240 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 Py_UNICODE *s = self->str;
6242 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006243
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006246
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 ch = Py_UNICODE_TOUPPER(*s);
6248 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 *s = ch;
6251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 s++;
6253 }
6254
6255 return status;
6256}
6257
Tim Petersced69f82003-09-16 20:30:58 +00006258static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259int fixlower(PyUnicodeObject *self)
6260{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006261 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 Py_UNICODE *s = self->str;
6263 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006264
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006267
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 ch = Py_UNICODE_TOLOWER(*s);
6269 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 *s = ch;
6272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 s++;
6274 }
6275
6276 return status;
6277}
6278
Tim Petersced69f82003-09-16 20:30:58 +00006279static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280int fixswapcase(PyUnicodeObject *self)
6281{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006282 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 Py_UNICODE *s = self->str;
6284 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006285
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 while (len-- > 0) {
6287 if (Py_UNICODE_ISUPPER(*s)) {
6288 *s = Py_UNICODE_TOLOWER(*s);
6289 status = 1;
6290 } else if (Py_UNICODE_ISLOWER(*s)) {
6291 *s = Py_UNICODE_TOUPPER(*s);
6292 status = 1;
6293 }
6294 s++;
6295 }
6296
6297 return status;
6298}
6299
Tim Petersced69f82003-09-16 20:30:58 +00006300static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301int fixcapitalize(PyUnicodeObject *self)
6302{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006303 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006304 Py_UNICODE *s = self->str;
6305 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006306
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006307 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006309 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 *s = Py_UNICODE_TOUPPER(*s);
6311 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006313 s++;
6314 while (--len > 0) {
6315 if (Py_UNICODE_ISUPPER(*s)) {
6316 *s = Py_UNICODE_TOLOWER(*s);
6317 status = 1;
6318 }
6319 s++;
6320 }
6321 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322}
6323
6324static
6325int fixtitle(PyUnicodeObject *self)
6326{
6327 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6328 register Py_UNICODE *e;
6329 int previous_is_cased;
6330
6331 /* Shortcut for single character strings */
6332 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6334 if (*p != ch) {
6335 *p = ch;
6336 return 1;
6337 }
6338 else
6339 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 }
Tim Petersced69f82003-09-16 20:30:58 +00006341
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 e = p + PyUnicode_GET_SIZE(self);
6343 previous_is_cased = 0;
6344 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006346
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 if (previous_is_cased)
6348 *p = Py_UNICODE_TOLOWER(ch);
6349 else
6350 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006351
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 if (Py_UNICODE_ISLOWER(ch) ||
6353 Py_UNICODE_ISUPPER(ch) ||
6354 Py_UNICODE_ISTITLE(ch))
6355 previous_is_cased = 1;
6356 else
6357 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 }
6359 return 1;
6360}
6361
Tim Peters8ce9f162004-08-27 01:49:32 +00006362PyObject *
6363PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364{
Skip Montanaro6543b452004-09-16 03:28:13 +00006365 const Py_UNICODE blank = ' ';
6366 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006367 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006368 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006369 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6370 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006371 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6372 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006373 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006374 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375
Tim Peters05eba1f2004-08-27 21:32:02 +00006376 fseq = PySequence_Fast(seq, "");
6377 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006378 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006379 }
6380
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006381 /* NOTE: the following code can't call back into Python code,
6382 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006383 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006384
Tim Peters05eba1f2004-08-27 21:32:02 +00006385 seqlen = PySequence_Fast_GET_SIZE(fseq);
6386 /* If empty sequence, return u"". */
6387 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006388 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6389 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006390 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006391 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006392 /* If singleton sequence with an exact Unicode, return that. */
6393 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 item = items[0];
6395 if (PyUnicode_CheckExact(item)) {
6396 Py_INCREF(item);
6397 res = (PyUnicodeObject *)item;
6398 goto Done;
6399 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006400 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006401 else {
6402 /* Set up sep and seplen */
6403 if (separator == NULL) {
6404 sep = &blank;
6405 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006406 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006407 else {
6408 if (!PyUnicode_Check(separator)) {
6409 PyErr_Format(PyExc_TypeError,
6410 "separator: expected str instance,"
6411 " %.80s found",
6412 Py_TYPE(separator)->tp_name);
6413 goto onError;
6414 }
6415 sep = PyUnicode_AS_UNICODE(separator);
6416 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006417 }
6418 }
6419
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006420 /* There are at least two things to join, or else we have a subclass
6421 * of str in the sequence.
6422 * Do a pre-pass to figure out the total amount of space we'll
6423 * need (sz), and see whether all argument are strings.
6424 */
6425 sz = 0;
6426 for (i = 0; i < seqlen; i++) {
6427 const Py_ssize_t old_sz = sz;
6428 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 if (!PyUnicode_Check(item)) {
6430 PyErr_Format(PyExc_TypeError,
6431 "sequence item %zd: expected str instance,"
6432 " %.80s found",
6433 i, Py_TYPE(item)->tp_name);
6434 goto onError;
6435 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006436 sz += PyUnicode_GET_SIZE(item);
6437 if (i != 0)
6438 sz += seplen;
6439 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6440 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006442 goto onError;
6443 }
6444 }
Tim Petersced69f82003-09-16 20:30:58 +00006445
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006446 res = _PyUnicode_New(sz);
6447 if (res == NULL)
6448 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006449
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006450 /* Catenate everything. */
6451 res_p = PyUnicode_AS_UNICODE(res);
6452 for (i = 0; i < seqlen; ++i) {
6453 Py_ssize_t itemlen;
6454 item = items[i];
6455 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 /* Copy item, and maybe the separator. */
6457 if (i) {
6458 Py_UNICODE_COPY(res_p, sep, seplen);
6459 res_p += seplen;
6460 }
6461 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6462 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006463 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006464
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006466 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 return (PyObject *)res;
6468
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006470 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006471 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 return NULL;
6473}
6474
Tim Petersced69f82003-09-16 20:30:58 +00006475static
6476PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 Py_ssize_t left,
6478 Py_ssize_t right,
6479 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480{
6481 PyUnicodeObject *u;
6482
6483 if (left < 0)
6484 left = 0;
6485 if (right < 0)
6486 right = 0;
6487
Tim Peters7a29bd52001-09-12 03:03:31 +00006488 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 Py_INCREF(self);
6490 return self;
6491 }
6492
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006493 if (left > PY_SSIZE_T_MAX - self->length ||
6494 right > PY_SSIZE_T_MAX - (left + self->length)) {
6495 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6496 return NULL;
6497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 u = _PyUnicode_New(left + self->length + right);
6499 if (u) {
6500 if (left)
6501 Py_UNICODE_FILL(u->str, fill, left);
6502 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6503 if (right)
6504 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6505 }
6506
6507 return u;
6508}
6509
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006510PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
6514 string = PyUnicode_FromObject(string);
6515 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006518 list = stringlib_splitlines(
6519 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6520 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
6522 Py_DECREF(string);
6523 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524}
6525
Tim Petersced69f82003-09-16 20:30:58 +00006526static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 PyUnicodeObject *substring,
6529 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006532 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006535 return stringlib_split_whitespace(
6536 (PyObject*) self, self->str, self->length, maxcount
6537 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006539 return stringlib_split(
6540 (PyObject*) self, self->str, self->length,
6541 substring->str, substring->length,
6542 maxcount
6543 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544}
6545
Tim Petersced69f82003-09-16 20:30:58 +00006546static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006547PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 PyUnicodeObject *substring,
6549 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006550{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006551 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006552 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006553
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006554 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006555 return stringlib_rsplit_whitespace(
6556 (PyObject*) self, self->str, self->length, maxcount
6557 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006558
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006559 return stringlib_rsplit(
6560 (PyObject*) self, self->str, self->length,
6561 substring->str, substring->length,
6562 maxcount
6563 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006564}
6565
6566static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 PyUnicodeObject *str1,
6569 PyUnicodeObject *str2,
6570 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571{
6572 PyUnicodeObject *u;
6573
6574 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006576 else if (maxcount == 0 || self->length == 0)
6577 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578
Thomas Wouters477c8d52006-05-27 19:21:47 +00006579 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006580 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006581 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006582 if (str1->length == 0)
6583 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584 if (str1->length == 1) {
6585 /* replace characters */
6586 Py_UNICODE u1, u2;
6587 if (!findchar(self->str, self->length, str1->str[0]))
6588 goto nothing;
6589 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6590 if (!u)
6591 return NULL;
6592 Py_UNICODE_COPY(u->str, self->str, self->length);
6593 u1 = str1->str[0];
6594 u2 = str2->str[0];
6595 for (i = 0; i < u->length; i++)
6596 if (u->str[i] == u1) {
6597 if (--maxcount < 0)
6598 break;
6599 u->str[i] = u2;
6600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006602 i = stringlib_find(
6603 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006605 if (i < 0)
6606 goto nothing;
6607 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6608 if (!u)
6609 return NULL;
6610 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006611
6612 /* change everything in-place, starting with this one */
6613 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6614 i += str1->length;
6615
6616 while ( --maxcount > 0) {
6617 i = stringlib_find(self->str+i, self->length-i,
6618 str1->str, str1->length,
6619 i);
6620 if (i == -1)
6621 break;
6622 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6623 i += str1->length;
6624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006627
6628 Py_ssize_t n, i, j, e;
6629 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 Py_UNICODE *p;
6631
6632 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006633 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6634 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006635 if (n == 0)
6636 goto nothing;
6637 /* new_size = self->length + n * (str2->length - str1->length)); */
6638 delta = (str2->length - str1->length);
6639 if (delta == 0) {
6640 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006642 product = n * (str2->length - str1->length);
6643 if ((product / (str2->length - str1->length)) != n) {
6644 PyErr_SetString(PyExc_OverflowError,
6645 "replace string is too long");
6646 return NULL;
6647 }
6648 new_size = self->length + product;
6649 if (new_size < 0) {
6650 PyErr_SetString(PyExc_OverflowError,
6651 "replace string is too long");
6652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 }
6654 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006655 u = _PyUnicode_New(new_size);
6656 if (!u)
6657 return NULL;
6658 i = 0;
6659 p = u->str;
6660 e = self->length - str1->length;
6661 if (str1->length > 0) {
6662 while (n-- > 0) {
6663 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006664 j = stringlib_find(self->str+i, self->length-i,
6665 str1->str, str1->length,
6666 i);
6667 if (j == -1)
6668 break;
6669 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006670 /* copy unchanged part [i:j] */
6671 Py_UNICODE_COPY(p, self->str+i, j-i);
6672 p += j - i;
6673 }
6674 /* copy substitution string */
6675 if (str2->length > 0) {
6676 Py_UNICODE_COPY(p, str2->str, str2->length);
6677 p += str2->length;
6678 }
6679 i = j + str1->length;
6680 }
6681 if (i < self->length)
6682 /* copy tail [i:] */
6683 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6684 } else {
6685 /* interleave */
6686 while (n > 0) {
6687 Py_UNICODE_COPY(p, str2->str, str2->length);
6688 p += str2->length;
6689 if (--n <= 0)
6690 break;
6691 *p++ = self->str[i++];
6692 }
6693 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006697
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006699 /* nothing to replace; return original string (when possible) */
6700 if (PyUnicode_CheckExact(self)) {
6701 Py_INCREF(self);
6702 return (PyObject *) self;
6703 }
6704 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705}
6706
6707/* --- Unicode Object Methods --------------------------------------------- */
6708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006709PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711\n\
6712Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006713characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714
6715static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006716unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 return fixup(self, fixtitle);
6719}
6720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006721PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723\n\
6724Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006725have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726
6727static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006728unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 return fixup(self, fixcapitalize);
6731}
6732
6733#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006734PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736\n\
6737Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006738normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739
6740static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006741unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742{
6743 PyObject *list;
6744 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006745 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 /* Split into words */
6748 list = split(self, NULL, -1);
6749 if (!list)
6750 return NULL;
6751
6752 /* Capitalize each word */
6753 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6754 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 if (item == NULL)
6757 goto onError;
6758 Py_DECREF(PyList_GET_ITEM(list, i));
6759 PyList_SET_ITEM(list, i, item);
6760 }
6761
6762 /* Join the words to form a new string */
6763 item = PyUnicode_Join(NULL, list);
6764
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766 Py_DECREF(list);
6767 return (PyObject *)item;
6768}
6769#endif
6770
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006771/* Argument converter. Coerces to a single unicode character */
6772
6773static int
6774convert_uc(PyObject *obj, void *addr)
6775{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006776 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6777 PyObject *uniobj;
6778 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006779
Benjamin Peterson14339b62009-01-31 16:36:08 +00006780 uniobj = PyUnicode_FromObject(obj);
6781 if (uniobj == NULL) {
6782 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006784 return 0;
6785 }
6786 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6787 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006789 Py_DECREF(uniobj);
6790 return 0;
6791 }
6792 unistr = PyUnicode_AS_UNICODE(uniobj);
6793 *fillcharloc = unistr[0];
6794 Py_DECREF(uniobj);
6795 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006796}
6797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006798PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006801Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006802done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
6804static PyObject *
6805unicode_center(PyUnicodeObject *self, PyObject *args)
6806{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006807 Py_ssize_t marg, left;
6808 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006809 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810
Thomas Woutersde017742006-02-16 19:34:37 +00006811 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 return NULL;
6813
Tim Peters7a29bd52001-09-12 03:03:31 +00006814 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 Py_INCREF(self);
6816 return (PyObject*) self;
6817 }
6818
6819 marg = width - self->length;
6820 left = marg / 2 + (marg & width & 1);
6821
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006822 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Marc-André Lemburge5034372000-08-08 08:04:29 +00006825#if 0
6826
6827/* This code should go into some future Unicode collation support
6828 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006829 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006830
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006831/* speedy UTF-16 code point order comparison */
6832/* gleaned from: */
6833/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6834
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006835static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006836{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006837 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006838 0, 0, 0, 0, 0, 0, 0, 0,
6839 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006840 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006841};
6842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843static int
6844unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6845{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006846 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006847
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 Py_UNICODE *s1 = str1->str;
6849 Py_UNICODE *s2 = str2->str;
6850
6851 len1 = str1->length;
6852 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006855 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006856
6857 c1 = *s1++;
6858 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006859
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 if (c1 > (1<<11) * 26)
6861 c1 += utf16Fixup[c1>>11];
6862 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006863 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006864 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006865
6866 if (c1 != c2)
6867 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006868
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006869 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 }
6871
6872 return (len1 < len2) ? -1 : (len1 != len2);
6873}
6874
Marc-André Lemburge5034372000-08-08 08:04:29 +00006875#else
6876
6877static int
6878unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6879{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006880 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006881
6882 Py_UNICODE *s1 = str1->str;
6883 Py_UNICODE *s2 = str2->str;
6884
6885 len1 = str1->length;
6886 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006887
Marc-André Lemburge5034372000-08-08 08:04:29 +00006888 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006889 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006890
Fredrik Lundh45714e92001-06-26 16:39:36 +00006891 c1 = *s1++;
6892 c2 = *s2++;
6893
6894 if (c1 != c2)
6895 return (c1 < c2) ? -1 : 1;
6896
Marc-André Lemburge5034372000-08-08 08:04:29 +00006897 len1--; len2--;
6898 }
6899
6900 return (len1 < len2) ? -1 : (len1 != len2);
6901}
6902
6903#endif
6904
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006908 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6909 return unicode_compare((PyUnicodeObject *)left,
6910 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006911 PyErr_Format(PyExc_TypeError,
6912 "Can't compare %.100s and %.100s",
6913 left->ob_type->tp_name,
6914 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 return -1;
6916}
6917
Martin v. Löwis5b222132007-06-10 09:51:05 +00006918int
6919PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6920{
6921 int i;
6922 Py_UNICODE *id;
6923 assert(PyUnicode_Check(uni));
6924 id = PyUnicode_AS_UNICODE(uni);
6925 /* Compare Unicode string and source character set string */
6926 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 if (id[i] != str[i])
6928 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006929 /* This check keeps Python strings that end in '\0' from comparing equal
6930 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006931 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006933 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006935 return 0;
6936}
6937
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006938
Benjamin Peterson29060642009-01-31 22:14:21 +00006939#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006940 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006941
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006942PyObject *PyUnicode_RichCompare(PyObject *left,
6943 PyObject *right,
6944 int op)
6945{
6946 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006947
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006948 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6949 PyObject *v;
6950 if (((PyUnicodeObject *) left)->length !=
6951 ((PyUnicodeObject *) right)->length) {
6952 if (op == Py_EQ) {
6953 Py_INCREF(Py_False);
6954 return Py_False;
6955 }
6956 if (op == Py_NE) {
6957 Py_INCREF(Py_True);
6958 return Py_True;
6959 }
6960 }
6961 if (left == right)
6962 result = 0;
6963 else
6964 result = unicode_compare((PyUnicodeObject *)left,
6965 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006966
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006967 /* Convert the return value to a Boolean */
6968 switch (op) {
6969 case Py_EQ:
6970 v = TEST_COND(result == 0);
6971 break;
6972 case Py_NE:
6973 v = TEST_COND(result != 0);
6974 break;
6975 case Py_LE:
6976 v = TEST_COND(result <= 0);
6977 break;
6978 case Py_GE:
6979 v = TEST_COND(result >= 0);
6980 break;
6981 case Py_LT:
6982 v = TEST_COND(result == -1);
6983 break;
6984 case Py_GT:
6985 v = TEST_COND(result == 1);
6986 break;
6987 default:
6988 PyErr_BadArgument();
6989 return NULL;
6990 }
6991 Py_INCREF(v);
6992 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006993 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006994
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006995 Py_INCREF(Py_NotImplemented);
6996 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006997}
6998
Guido van Rossum403d68b2000-03-13 15:55:09 +00006999int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007001{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007002 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007003 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007004
7005 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007006 sub = PyUnicode_FromObject(element);
7007 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 PyErr_Format(PyExc_TypeError,
7009 "'in <string>' requires string as left operand, not %s",
7010 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007011 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007012 }
7013
Thomas Wouters477c8d52006-05-27 19:21:47 +00007014 str = PyUnicode_FromObject(container);
7015 if (!str) {
7016 Py_DECREF(sub);
7017 return -1;
7018 }
7019
7020 result = stringlib_contains_obj(str, sub);
7021
7022 Py_DECREF(str);
7023 Py_DECREF(sub);
7024
Guido van Rossum403d68b2000-03-13 15:55:09 +00007025 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007026}
7027
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028/* Concat to string or Unicode object giving a new Unicode object. */
7029
7030PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032{
7033 PyUnicodeObject *u = NULL, *v = NULL, *w;
7034
7035 /* Coerce the two arguments */
7036 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7037 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7040 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042
7043 /* Shortcuts */
7044 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 Py_DECREF(v);
7046 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 }
7048 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 Py_DECREF(u);
7050 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 }
7052
7053 /* Concat the two Unicode strings */
7054 w = _PyUnicode_New(u->length + v->length);
7055 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 Py_UNICODE_COPY(w->str, u->str, u->length);
7058 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7059
7060 Py_DECREF(u);
7061 Py_DECREF(v);
7062 return (PyObject *)w;
7063
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 Py_XDECREF(u);
7066 Py_XDECREF(v);
7067 return NULL;
7068}
7069
Walter Dörwald1ab83302007-05-18 17:15:44 +00007070void
7071PyUnicode_Append(PyObject **pleft, PyObject *right)
7072{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007073 PyObject *new;
7074 if (*pleft == NULL)
7075 return;
7076 if (right == NULL || !PyUnicode_Check(*pleft)) {
7077 Py_DECREF(*pleft);
7078 *pleft = NULL;
7079 return;
7080 }
7081 new = PyUnicode_Concat(*pleft, right);
7082 Py_DECREF(*pleft);
7083 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007084}
7085
7086void
7087PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007089 PyUnicode_Append(pleft, right);
7090 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007091}
7092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007093PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007096Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007097string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007098interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099
7100static PyObject *
7101unicode_count(PyUnicodeObject *self, PyObject *args)
7102{
7103 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007104 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007105 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 PyObject *result;
7107
Guido van Rossumb8872e62000-05-09 14:14:27 +00007108 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 return NULL;
7111
7112 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007113 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007116
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007117 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007118 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007119 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007120 substring->str, substring->length,
7121 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007122 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123
7124 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007125
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126 return result;
7127}
7128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007129PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007132Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007133to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007134handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007135a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7136'xmlcharrefreplace' as well as any other name registered with\n\
7137codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
7139static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007140unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007142 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 char *encoding = NULL;
7144 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007145 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007146
Benjamin Peterson308d6372009-09-18 21:42:35 +00007147 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7148 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007150 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007151 if (v == NULL)
7152 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007153 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007154 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007155 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007156 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007157 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007158 Py_DECREF(v);
7159 return NULL;
7160 }
7161 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007162
Benjamin Peterson29060642009-01-31 22:14:21 +00007163 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007164 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007165}
7166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007167PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169\n\
7170Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007171If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172
7173static PyObject*
7174unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7175{
7176 Py_UNICODE *e;
7177 Py_UNICODE *p;
7178 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007179 Py_UNICODE *qe;
7180 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 PyUnicodeObject *u;
7182 int tabsize = 8;
7183
7184 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186
Thomas Wouters7e474022000-07-16 12:04:32 +00007187 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007188 i = 0; /* chars up to and including most recent \n or \r */
7189 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7190 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 for (p = self->str; p < e; p++)
7192 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 if (tabsize > 0) {
7194 incr = tabsize - (j % tabsize); /* cannot overflow */
7195 if (j > PY_SSIZE_T_MAX - incr)
7196 goto overflow1;
7197 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 if (j > PY_SSIZE_T_MAX - 1)
7202 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 j++;
7204 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 if (i > PY_SSIZE_T_MAX - j)
7206 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007208 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209 }
7210 }
7211
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007212 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007214
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 /* Second pass: create output string and fill it */
7216 u = _PyUnicode_New(i + j);
7217 if (!u)
7218 return NULL;
7219
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007220 j = 0; /* same as in first pass */
7221 q = u->str; /* next output char */
7222 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223
7224 for (p = self->str; p < e; p++)
7225 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 if (tabsize > 0) {
7227 i = tabsize - (j % tabsize);
7228 j += i;
7229 while (i--) {
7230 if (q >= qe)
7231 goto overflow2;
7232 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007233 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007235 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 else {
7237 if (q >= qe)
7238 goto overflow2;
7239 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007240 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 if (*p == '\n' || *p == '\r')
7242 j = 0;
7243 }
7244
7245 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007246
7247 overflow2:
7248 Py_DECREF(u);
7249 overflow1:
7250 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252}
7253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007254PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256\n\
7257Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007258such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259arguments start and end are interpreted as in slice notation.\n\
7260\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007261Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262
7263static PyObject *
7264unicode_find(PyUnicodeObject *self, PyObject *args)
7265{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007266 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007267 Py_ssize_t start;
7268 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007269 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270
Christian Heimes9cd17752007-11-18 19:35:23 +00007271 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273
Thomas Wouters477c8d52006-05-27 19:21:47 +00007274 result = stringlib_find_slice(
7275 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7276 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7277 start, end
7278 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279
7280 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007281
Christian Heimes217cfd12007-12-02 14:31:20 +00007282 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283}
7284
7285static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007286unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287{
7288 if (index < 0 || index >= self->length) {
7289 PyErr_SetString(PyExc_IndexError, "string index out of range");
7290 return NULL;
7291 }
7292
7293 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7294}
7295
Guido van Rossumc2504932007-09-18 19:42:40 +00007296/* Believe it or not, this produces the same value for ASCII strings
7297 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007299unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300{
Guido van Rossumc2504932007-09-18 19:42:40 +00007301 Py_ssize_t len;
7302 Py_UNICODE *p;
7303 long x;
7304
7305 if (self->hash != -1)
7306 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007307 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007308 p = self->str;
7309 x = *p << 7;
7310 while (--len >= 0)
7311 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007312 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007313 if (x == -1)
7314 x = -2;
7315 self->hash = x;
7316 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317}
7318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007319PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007322Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323
7324static PyObject *
7325unicode_index(PyUnicodeObject *self, PyObject *args)
7326{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007327 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007328 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007329 Py_ssize_t start;
7330 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331
Christian Heimes9cd17752007-11-18 19:35:23 +00007332 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334
Thomas Wouters477c8d52006-05-27 19:21:47 +00007335 result = stringlib_find_slice(
7336 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7337 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7338 start, end
7339 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340
7341 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007342
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 if (result < 0) {
7344 PyErr_SetString(PyExc_ValueError, "substring not found");
7345 return NULL;
7346 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007347
Christian Heimes217cfd12007-12-02 14:31:20 +00007348 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349}
7350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007351PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007354Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007355at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356
7357static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007358unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359{
7360 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7361 register const Py_UNICODE *e;
7362 int cased;
7363
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364 /* Shortcut for single character strings */
7365 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007368 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007369 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007371
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 e = p + PyUnicode_GET_SIZE(self);
7373 cased = 0;
7374 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007376
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7378 return PyBool_FromLong(0);
7379 else if (!cased && Py_UNICODE_ISLOWER(ch))
7380 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007382 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383}
7384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007385PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007388Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007389at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390
7391static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007392unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393{
7394 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7395 register const Py_UNICODE *e;
7396 int cased;
7397
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398 /* Shortcut for single character strings */
7399 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007402 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007403 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007405
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 e = p + PyUnicode_GET_SIZE(self);
7407 cased = 0;
7408 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007410
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7412 return PyBool_FromLong(0);
7413 else if (!cased && Py_UNICODE_ISUPPER(ch))
7414 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007416 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417}
7418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007419PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007422Return True if S is a titlecased string and there is at least one\n\
7423character in S, i.e. upper- and titlecase characters may only\n\
7424follow uncased characters and lowercase characters only cased ones.\n\
7425Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426
7427static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007428unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429{
7430 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7431 register const Py_UNICODE *e;
7432 int cased, previous_is_cased;
7433
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 /* Shortcut for single character strings */
7435 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7437 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007439 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007440 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007442
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 e = p + PyUnicode_GET_SIZE(self);
7444 cased = 0;
7445 previous_is_cased = 0;
7446 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007448
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7450 if (previous_is_cased)
7451 return PyBool_FromLong(0);
7452 previous_is_cased = 1;
7453 cased = 1;
7454 }
7455 else if (Py_UNICODE_ISLOWER(ch)) {
7456 if (!previous_is_cased)
7457 return PyBool_FromLong(0);
7458 previous_is_cased = 1;
7459 cased = 1;
7460 }
7461 else
7462 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007464 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465}
7466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007467PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007470Return True if all characters in S are whitespace\n\
7471and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472
7473static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007474unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475{
7476 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7477 register const Py_UNICODE *e;
7478
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 /* Shortcut for single character strings */
7480 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 Py_UNICODE_ISSPACE(*p))
7482 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007484 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007485 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007487
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 e = p + PyUnicode_GET_SIZE(self);
7489 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 if (!Py_UNICODE_ISSPACE(*p))
7491 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007493 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494}
7495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007496PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007498\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007499Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007500and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007501
7502static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007503unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007504{
7505 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7506 register const Py_UNICODE *e;
7507
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007508 /* Shortcut for single character strings */
7509 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 Py_UNICODE_ISALPHA(*p))
7511 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007512
7513 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007514 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007516
7517 e = p + PyUnicode_GET_SIZE(self);
7518 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 if (!Py_UNICODE_ISALPHA(*p))
7520 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007521 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007522 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007523}
7524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007525PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007527\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007528Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007529and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007530
7531static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007532unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007533{
7534 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7535 register const Py_UNICODE *e;
7536
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007537 /* Shortcut for single character strings */
7538 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 Py_UNICODE_ISALNUM(*p))
7540 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007541
7542 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007543 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007545
7546 e = p + PyUnicode_GET_SIZE(self);
7547 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 if (!Py_UNICODE_ISALNUM(*p))
7549 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007550 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007551 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007552}
7553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007554PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007557Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007558False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559
7560static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007561unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562{
7563 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7564 register const Py_UNICODE *e;
7565
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 /* Shortcut for single character strings */
7567 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 Py_UNICODE_ISDECIMAL(*p))
7569 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007571 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007572 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007574
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575 e = p + PyUnicode_GET_SIZE(self);
7576 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 if (!Py_UNICODE_ISDECIMAL(*p))
7578 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007580 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581}
7582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007586Return True if all characters in S are digits\n\
7587and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588
7589static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007590unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591{
7592 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7593 register const Py_UNICODE *e;
7594
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 /* Shortcut for single character strings */
7596 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 Py_UNICODE_ISDIGIT(*p))
7598 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007600 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007601 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007603
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604 e = p + PyUnicode_GET_SIZE(self);
7605 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 if (!Py_UNICODE_ISDIGIT(*p))
7607 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007609 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610}
7611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007612PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007615Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007616False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617
7618static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007619unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620{
7621 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7622 register const Py_UNICODE *e;
7623
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624 /* Shortcut for single character strings */
7625 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 Py_UNICODE_ISNUMERIC(*p))
7627 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007629 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007630 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007632
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633 e = p + PyUnicode_GET_SIZE(self);
7634 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 if (!Py_UNICODE_ISNUMERIC(*p))
7636 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007638 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639}
7640
Martin v. Löwis47383402007-08-15 07:32:56 +00007641int
7642PyUnicode_IsIdentifier(PyObject *self)
7643{
7644 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7645 register const Py_UNICODE *e;
7646
7647 /* Special case for empty strings */
7648 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007650
7651 /* PEP 3131 says that the first character must be in
7652 XID_Start and subsequent characters in XID_Continue,
7653 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007654 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007655 letters, digits, underscore). However, given the current
7656 definition of XID_Start and XID_Continue, it is sufficient
7657 to check just for these, except that _ must be allowed
7658 as starting an identifier. */
7659 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7660 return 0;
7661
7662 e = p + PyUnicode_GET_SIZE(self);
7663 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 if (!_PyUnicode_IsXidContinue(*p))
7665 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007666 }
7667 return 1;
7668}
7669
7670PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007672\n\
7673Return True if S is a valid identifier according\n\
7674to the language definition.");
7675
7676static PyObject*
7677unicode_isidentifier(PyObject *self)
7678{
7679 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7680}
7681
Georg Brandl559e5d72008-06-11 18:37:52 +00007682PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007684\n\
7685Return True if all characters in S are considered\n\
7686printable in repr() or S is empty, False otherwise.");
7687
7688static PyObject*
7689unicode_isprintable(PyObject *self)
7690{
7691 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7692 register const Py_UNICODE *e;
7693
7694 /* Shortcut for single character strings */
7695 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7696 Py_RETURN_TRUE;
7697 }
7698
7699 e = p + PyUnicode_GET_SIZE(self);
7700 for (; p < e; p++) {
7701 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7702 Py_RETURN_FALSE;
7703 }
7704 }
7705 Py_RETURN_TRUE;
7706}
7707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007708PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007709 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710\n\
7711Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007712iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713
7714static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007715unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007717 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718}
7719
Martin v. Löwis18e16552006-02-15 17:27:45 +00007720static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721unicode_length(PyUnicodeObject *self)
7722{
7723 return self->length;
7724}
7725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007726PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007729Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007730done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
7732static PyObject *
7733unicode_ljust(PyUnicodeObject *self, PyObject *args)
7734{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007735 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007736 Py_UNICODE fillchar = ' ';
7737
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007738 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739 return NULL;
7740
Tim Peters7a29bd52001-09-12 03:03:31 +00007741 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 Py_INCREF(self);
7743 return (PyObject*) self;
7744 }
7745
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007746 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747}
7748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007749PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007752Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753
7754static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007755unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 return fixup(self, fixlower);
7758}
7759
/* Strip modes accepted by the strip helpers; the values double as
   indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7768
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007769/* externally visible for str.strip(unicode) */
7770PyObject *
7771_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7772{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007773 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7774 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7775 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7776 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7777 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007778
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007780
Benjamin Peterson14339b62009-01-31 16:36:08 +00007781 i = 0;
7782 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7784 i++;
7785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007786 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007787
Benjamin Peterson14339b62009-01-31 16:36:08 +00007788 j = len;
7789 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 do {
7791 j--;
7792 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7793 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007794 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007795
Benjamin Peterson14339b62009-01-31 16:36:08 +00007796 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 Py_INCREF(self);
7798 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007799 }
7800 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007802}
7803
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
7805static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007806do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007808 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7809 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007810
Benjamin Peterson14339b62009-01-31 16:36:08 +00007811 i = 0;
7812 if (striptype != RIGHTSTRIP) {
7813 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7814 i++;
7815 }
7816 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007817
Benjamin Peterson14339b62009-01-31 16:36:08 +00007818 j = len;
7819 if (striptype != LEFTSTRIP) {
7820 do {
7821 j--;
7822 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7823 j++;
7824 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007825
Benjamin Peterson14339b62009-01-31 16:36:08 +00007826 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7827 Py_INCREF(self);
7828 return (PyObject*)self;
7829 }
7830 else
7831 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832}
7833
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007834
7835static PyObject *
7836do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7837{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007838 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007839
Benjamin Peterson14339b62009-01-31 16:36:08 +00007840 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7841 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007842
Benjamin Peterson14339b62009-01-31 16:36:08 +00007843 if (sep != NULL && sep != Py_None) {
7844 if (PyUnicode_Check(sep))
7845 return _PyUnicode_XStrip(self, striptype, sep);
7846 else {
7847 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 "%s arg must be None or str",
7849 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007850 return NULL;
7851 }
7852 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007853
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007855}
7856
7857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007858PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007860\n\
7861Return a copy of the string S with leading and trailing\n\
7862whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007863If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007864
7865static PyObject *
7866unicode_strip(PyUnicodeObject *self, PyObject *args)
7867{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007868 if (PyTuple_GET_SIZE(args) == 0)
7869 return do_strip(self, BOTHSTRIP); /* Common case */
7870 else
7871 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007872}
7873
7874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007875PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007877\n\
7878Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007879If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007880
7881static PyObject *
7882unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7883{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007884 if (PyTuple_GET_SIZE(args) == 0)
7885 return do_strip(self, LEFTSTRIP); /* Common case */
7886 else
7887 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007888}
7889
7890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007891PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007893\n\
7894Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007895If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007896
7897static PyObject *
7898unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7899{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007900 if (PyTuple_GET_SIZE(args) == 0)
7901 return do_strip(self, RIGHTSTRIP); /* Common case */
7902 else
7903 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007904}
7905
7906
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007908unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909{
7910 PyUnicodeObject *u;
7911 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007912 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007913 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914
Georg Brandl222de0f2009-04-12 12:01:50 +00007915 if (len < 1) {
7916 Py_INCREF(unicode_empty);
7917 return (PyObject *)unicode_empty;
7918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919
Tim Peters7a29bd52001-09-12 03:03:31 +00007920 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921 /* no repeat, return original string */
7922 Py_INCREF(str);
7923 return (PyObject*) str;
7924 }
Tim Peters8f422462000-09-09 06:13:41 +00007925
7926 /* ensure # of chars needed doesn't overflow int and # of bytes
7927 * needed doesn't overflow size_t
7928 */
7929 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007930 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007931 PyErr_SetString(PyExc_OverflowError,
7932 "repeated string is too long");
7933 return NULL;
7934 }
7935 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7936 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7937 PyErr_SetString(PyExc_OverflowError,
7938 "repeated string is too long");
7939 return NULL;
7940 }
7941 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 if (!u)
7943 return NULL;
7944
7945 p = u->str;
7946
Georg Brandl222de0f2009-04-12 12:01:50 +00007947 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007948 Py_UNICODE_FILL(p, str->str[0], len);
7949 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007950 Py_ssize_t done = str->length; /* number of characters copied this far */
7951 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007953 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007954 Py_UNICODE_COPY(p+done, p, n);
7955 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957 }
7958
7959 return (PyObject*) u;
7960}
7961
7962PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 PyObject *subobj,
7964 PyObject *replobj,
7965 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966{
7967 PyObject *self;
7968 PyObject *str1;
7969 PyObject *str2;
7970 PyObject *result;
7971
7972 self = PyUnicode_FromObject(obj);
7973 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 str1 = PyUnicode_FromObject(subobj);
7976 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 Py_DECREF(self);
7978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 }
7980 str2 = PyUnicode_FromObject(replobj);
7981 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 Py_DECREF(self);
7983 Py_DECREF(str1);
7984 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 }
Tim Petersced69f82003-09-16 20:30:58 +00007986 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 (PyUnicodeObject *)str1,
7988 (PyUnicodeObject *)str2,
7989 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 Py_DECREF(self);
7991 Py_DECREF(str1);
7992 Py_DECREF(str2);
7993 return result;
7994}
7995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007996PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00007997 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998\n\
7999Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008000old replaced by new. If the optional argument count is\n\
8001given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002
8003static PyObject*
8004unicode_replace(PyUnicodeObject *self, PyObject *args)
8005{
8006 PyUnicodeObject *str1;
8007 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008008 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 PyObject *result;
8010
Martin v. Löwis18e16552006-02-15 17:27:45 +00008011 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 return NULL;
8013 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8014 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008017 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 Py_DECREF(str1);
8019 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021
8022 result = replace(self, str1, str2, maxcount);
8023
8024 Py_DECREF(str1);
8025 Py_DECREF(str2);
8026 return result;
8027}
8028
8029static
8030PyObject *unicode_repr(PyObject *unicode)
8031{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008032 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008033 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008034 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8035 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8036
8037 /* XXX(nnorwitz): rather than over-allocating, it would be
8038 better to choose a different scheme. Perhaps scan the
8039 first N-chars of the string and allocate based on that size.
8040 */
8041 /* Initial allocation is based on the longest-possible unichr
8042 escape.
8043
8044 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8045 unichr, so in this case it's the longest unichr escape. In
8046 narrow (UTF-16) builds this is five chars per source unichr
8047 since there are two unichrs in the surrogate pair, so in narrow
8048 (UTF-16) builds it's not the longest unichr escape.
8049
8050 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8051 so in the narrow (UTF-16) build case it's the longest unichr
8052 escape.
8053 */
8054
Walter Dörwald1ab83302007-05-18 17:15:44 +00008055 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008057#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008059#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008061#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008063 if (repr == NULL)
8064 return NULL;
8065
Walter Dörwald1ab83302007-05-18 17:15:44 +00008066 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008067
8068 /* Add quote */
8069 *p++ = (findchar(s, size, '\'') &&
8070 !findchar(s, size, '"')) ? '"' : '\'';
8071 while (size-- > 0) {
8072 Py_UNICODE ch = *s++;
8073
8074 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008075 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008076 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008077 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008078 continue;
8079 }
8080
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008082 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008083 *p++ = '\\';
8084 *p++ = 't';
8085 }
8086 else if (ch == '\n') {
8087 *p++ = '\\';
8088 *p++ = 'n';
8089 }
8090 else if (ch == '\r') {
8091 *p++ = '\\';
8092 *p++ = 'r';
8093 }
8094
8095 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008096 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008097 *p++ = '\\';
8098 *p++ = 'x';
8099 *p++ = hexdigits[(ch >> 4) & 0x000F];
8100 *p++ = hexdigits[ch & 0x000F];
8101 }
8102
Georg Brandl559e5d72008-06-11 18:37:52 +00008103 /* Copy ASCII characters as-is */
8104 else if (ch < 0x7F) {
8105 *p++ = ch;
8106 }
8107
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008109 else {
8110 Py_UCS4 ucs = ch;
8111
8112#ifndef Py_UNICODE_WIDE
8113 Py_UNICODE ch2 = 0;
8114 /* Get code point from surrogate pair */
8115 if (size > 0) {
8116 ch2 = *s;
8117 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008119 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008121 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008122 size--;
8123 }
8124 }
8125#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008126 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008127 (categories Z* and C* except ASCII space)
8128 */
8129 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8130 /* Map 8-bit characters to '\xhh' */
8131 if (ucs <= 0xff) {
8132 *p++ = '\\';
8133 *p++ = 'x';
8134 *p++ = hexdigits[(ch >> 4) & 0x000F];
8135 *p++ = hexdigits[ch & 0x000F];
8136 }
8137 /* Map 21-bit characters to '\U00xxxxxx' */
8138 else if (ucs >= 0x10000) {
8139 *p++ = '\\';
8140 *p++ = 'U';
8141 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8142 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8143 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8144 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8145 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8146 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8147 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8148 *p++ = hexdigits[ucs & 0x0000000F];
8149 }
8150 /* Map 16-bit characters to '\uxxxx' */
8151 else {
8152 *p++ = '\\';
8153 *p++ = 'u';
8154 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8155 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8156 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8157 *p++ = hexdigits[ucs & 0x000F];
8158 }
8159 }
8160 /* Copy characters as-is */
8161 else {
8162 *p++ = ch;
8163#ifndef Py_UNICODE_WIDE
8164 if (ucs >= 0x10000)
8165 *p++ = ch2;
8166#endif
8167 }
8168 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008169 }
8170 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008171 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008172
8173 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008174 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008175 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176}
8177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008178PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180\n\
8181Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008182such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183arguments start and end are interpreted as in slice notation.\n\
8184\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008185Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186
8187static PyObject *
8188unicode_rfind(PyUnicodeObject *self, PyObject *args)
8189{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008190 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008191 Py_ssize_t start;
8192 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008193 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194
Christian Heimes9cd17752007-11-18 19:35:23 +00008195 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197
Thomas Wouters477c8d52006-05-27 19:21:47 +00008198 result = stringlib_rfind_slice(
8199 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8200 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8201 start, end
8202 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203
8204 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008205
Christian Heimes217cfd12007-12-02 14:31:20 +00008206 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207}
8208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008209PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008212Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213
8214static PyObject *
8215unicode_rindex(PyUnicodeObject *self, PyObject *args)
8216{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008217 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008218 Py_ssize_t start;
8219 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008220 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221
Christian Heimes9cd17752007-11-18 19:35:23 +00008222 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224
Thomas Wouters477c8d52006-05-27 19:21:47 +00008225 result = stringlib_rfind_slice(
8226 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8227 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8228 start, end
8229 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230
8231 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008232
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 if (result < 0) {
8234 PyErr_SetString(PyExc_ValueError, "substring not found");
8235 return NULL;
8236 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008237 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238}
8239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008240PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008243Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008244done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245
8246static PyObject *
8247unicode_rjust(PyUnicodeObject *self, PyObject *args)
8248{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008249 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008250 Py_UNICODE fillchar = ' ';
8251
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008252 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253 return NULL;
8254
Tim Peters7a29bd52001-09-12 03:03:31 +00008255 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 Py_INCREF(self);
8257 return (PyObject*) self;
8258 }
8259
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008260 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261}
8262
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 PyObject *sep,
8265 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266{
8267 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008268
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 s = PyUnicode_FromObject(s);
8270 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008271 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 if (sep != NULL) {
8273 sep = PyUnicode_FromObject(sep);
8274 if (sep == NULL) {
8275 Py_DECREF(s);
8276 return NULL;
8277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 }
8279
8280 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8281
8282 Py_DECREF(s);
8283 Py_XDECREF(sep);
8284 return result;
8285}
8286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008287PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289\n\
8290Return a list of the words in S, using sep as the\n\
8291delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008292splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008293whitespace string is a separator and empty strings are\n\
8294removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295
8296static PyObject*
8297unicode_split(PyUnicodeObject *self, PyObject *args)
8298{
8299 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008300 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
Martin v. Löwis18e16552006-02-15 17:27:45 +00008302 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 return NULL;
8304
8305 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311}
8312
Thomas Wouters477c8d52006-05-27 19:21:47 +00008313PyObject *
8314PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8315{
8316 PyObject* str_obj;
8317 PyObject* sep_obj;
8318 PyObject* out;
8319
8320 str_obj = PyUnicode_FromObject(str_in);
8321 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008323 sep_obj = PyUnicode_FromObject(sep_in);
8324 if (!sep_obj) {
8325 Py_DECREF(str_obj);
8326 return NULL;
8327 }
8328
8329 out = stringlib_partition(
8330 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8331 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8332 );
8333
8334 Py_DECREF(sep_obj);
8335 Py_DECREF(str_obj);
8336
8337 return out;
8338}
8339
8340
8341PyObject *
8342PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8343{
8344 PyObject* str_obj;
8345 PyObject* sep_obj;
8346 PyObject* out;
8347
8348 str_obj = PyUnicode_FromObject(str_in);
8349 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008351 sep_obj = PyUnicode_FromObject(sep_in);
8352 if (!sep_obj) {
8353 Py_DECREF(str_obj);
8354 return NULL;
8355 }
8356
8357 out = stringlib_rpartition(
8358 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8359 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8360 );
8361
8362 Py_DECREF(sep_obj);
8363 Py_DECREF(str_obj);
8364
8365 return out;
8366}
8367
8368PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008370\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008371Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008372the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008373found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008374
8375static PyObject*
8376unicode_partition(PyUnicodeObject *self, PyObject *separator)
8377{
8378 return PyUnicode_Partition((PyObject *)self, separator);
8379}
8380
8381PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008382 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008383\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008384Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008385the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008386separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008387
8388static PyObject*
8389unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8390{
8391 return PyUnicode_RPartition((PyObject *)self, separator);
8392}
8393
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008394PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 PyObject *sep,
8396 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008397{
8398 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008399
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008400 s = PyUnicode_FromObject(s);
8401 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008402 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 if (sep != NULL) {
8404 sep = PyUnicode_FromObject(sep);
8405 if (sep == NULL) {
8406 Py_DECREF(s);
8407 return NULL;
8408 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008409 }
8410
8411 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8412
8413 Py_DECREF(s);
8414 Py_XDECREF(sep);
8415 return result;
8416}
8417
8418PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008420\n\
8421Return a list of the words in S, using sep as the\n\
8422delimiter string, starting at the end of the string and\n\
8423working to the front. If maxsplit is given, at most maxsplit\n\
8424splits are done. If sep is not specified, any whitespace string\n\
8425is a separator.");
8426
8427static PyObject*
8428unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8429{
8430 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008431 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008432
Martin v. Löwis18e16552006-02-15 17:27:45 +00008433 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008434 return NULL;
8435
8436 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008438 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008440 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008442}
8443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008444PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446\n\
8447Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008448Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008449is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450
8451static PyObject*
8452unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8453{
Guido van Rossum86662912000-04-11 15:38:46 +00008454 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455
Guido van Rossum86662912000-04-11 15:38:46 +00008456 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 return NULL;
8458
Guido van Rossum86662912000-04-11 15:38:46 +00008459 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460}
8461
8462static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008463PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464{
Walter Dörwald346737f2007-05-31 10:44:43 +00008465 if (PyUnicode_CheckExact(self)) {
8466 Py_INCREF(self);
8467 return self;
8468 } else
8469 /* Subtype -- return genuine unicode string with the same value. */
8470 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8471 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472}
8473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008474PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476\n\
8477Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008478and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479
8480static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008481unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 return fixup(self, fixswapcase);
8484}
8485
Georg Brandlceee0772007-11-27 23:48:05 +00008486PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008488\n\
8489Return a translation table usable for str.translate().\n\
8490If there is only one argument, it must be a dictionary mapping Unicode\n\
8491ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008492Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008493If there are two arguments, they must be strings of equal length, and\n\
8494in the resulting dictionary, each character in x will be mapped to the\n\
8495character at the same position in y. If there is a third argument, it\n\
8496must be a string, whose characters will be mapped to None in the result.");
8497
8498static PyObject*
8499unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8500{
8501 PyObject *x, *y = NULL, *z = NULL;
8502 PyObject *new = NULL, *key, *value;
8503 Py_ssize_t i = 0;
8504 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008505
Georg Brandlceee0772007-11-27 23:48:05 +00008506 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8507 return NULL;
8508 new = PyDict_New();
8509 if (!new)
8510 return NULL;
8511 if (y != NULL) {
8512 /* x must be a string too, of equal length */
8513 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8514 if (!PyUnicode_Check(x)) {
8515 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8516 "be a string if there is a second argument");
8517 goto err;
8518 }
8519 if (PyUnicode_GET_SIZE(x) != ylen) {
8520 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8521 "arguments must have equal length");
8522 goto err;
8523 }
8524 /* create entries for translating chars in x to those in y */
8525 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008526 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8527 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008528 if (!key || !value)
8529 goto err;
8530 res = PyDict_SetItem(new, key, value);
8531 Py_DECREF(key);
8532 Py_DECREF(value);
8533 if (res < 0)
8534 goto err;
8535 }
8536 /* create entries for deleting chars in z */
8537 if (z != NULL) {
8538 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008539 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008540 if (!key)
8541 goto err;
8542 res = PyDict_SetItem(new, key, Py_None);
8543 Py_DECREF(key);
8544 if (res < 0)
8545 goto err;
8546 }
8547 }
8548 } else {
8549 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008550 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008551 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8552 "to maketrans it must be a dict");
8553 goto err;
8554 }
8555 /* copy entries into the new dict, converting string keys to int keys */
8556 while (PyDict_Next(x, &i, &key, &value)) {
8557 if (PyUnicode_Check(key)) {
8558 /* convert string keys to integer keys */
8559 PyObject *newkey;
8560 if (PyUnicode_GET_SIZE(key) != 1) {
8561 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8562 "table must be of length 1");
8563 goto err;
8564 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008565 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008566 if (!newkey)
8567 goto err;
8568 res = PyDict_SetItem(new, newkey, value);
8569 Py_DECREF(newkey);
8570 if (res < 0)
8571 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008572 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008573 /* just keep integer keys */
8574 if (PyDict_SetItem(new, key, value) < 0)
8575 goto err;
8576 } else {
8577 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8578 "be strings or integers");
8579 goto err;
8580 }
8581 }
8582 }
8583 return new;
8584 err:
8585 Py_DECREF(new);
8586 return NULL;
8587}
8588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008589PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591\n\
8592Return a copy of the string S, where all characters have been mapped\n\
8593through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008594Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008595Unmapped characters are left untouched. Characters mapped to None\n\
8596are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597
8598static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008599unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600{
Georg Brandlceee0772007-11-27 23:48:05 +00008601 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602}
8603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008604PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008607Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608
8609static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008610unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 return fixup(self, fixupper);
8613}
8614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008615PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008618Pad a numeric string S with zeros on the left, to fill a field\n\
8619of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620
8621static PyObject *
8622unicode_zfill(PyUnicodeObject *self, PyObject *args)
8623{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008624 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 PyUnicodeObject *u;
8626
Martin v. Löwis18e16552006-02-15 17:27:45 +00008627 Py_ssize_t width;
8628 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 return NULL;
8630
8631 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008632 if (PyUnicode_CheckExact(self)) {
8633 Py_INCREF(self);
8634 return (PyObject*) self;
8635 }
8636 else
8637 return PyUnicode_FromUnicode(
8638 PyUnicode_AS_UNICODE(self),
8639 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 }
8642
8643 fill = width - self->length;
8644
8645 u = pad(self, fill, 0, '0');
8646
Walter Dörwald068325e2002-04-15 13:36:47 +00008647 if (u == NULL)
8648 return NULL;
8649
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 if (u->str[fill] == '+' || u->str[fill] == '-') {
8651 /* move sign to beginning of string */
8652 u->str[0] = u->str[fill];
8653 u->str[fill] = '0';
8654 }
8655
8656 return (PyObject*) u;
8657}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658
#if 0
/* Compiled out.  Reports the current value of the numfree counter as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008667PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008670Return True if S starts with the specified prefix, False otherwise.\n\
8671With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008672With optional end, stop comparing S at that position.\n\
8673prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674
8675static PyObject *
8676unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008679 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008681 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008682 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008683 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008685 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8687 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008688 if (PyTuple_Check(subobj)) {
8689 Py_ssize_t i;
8690 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8691 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008693 if (substring == NULL)
8694 return NULL;
8695 result = tailmatch(self, substring, start, end, -1);
8696 Py_DECREF(substring);
8697 if (result) {
8698 Py_RETURN_TRUE;
8699 }
8700 }
8701 /* nothing matched */
8702 Py_RETURN_FALSE;
8703 }
8704 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008707 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008709 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710}
8711
8712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008713PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008716Return True if S ends with the specified suffix, False otherwise.\n\
8717With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008718With optional end, stop comparing S at that position.\n\
8719suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720
8721static PyObject *
8722unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008725 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008727 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008728 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008729 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008731 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8733 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008734 if (PyTuple_Check(subobj)) {
8735 Py_ssize_t i;
8736 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8737 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008739 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008741 result = tailmatch(self, substring, start, end, +1);
8742 Py_DECREF(substring);
8743 if (result) {
8744 Py_RETURN_TRUE;
8745 }
8746 }
8747 Py_RETURN_FALSE;
8748 }
8749 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008753 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008755 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756}
8757
Eric Smith8c663262007-08-25 02:26:07 +00008758#include "stringlib/string_format.h"
8759
8760PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008762\n\
8763");
8764
Eric Smith4a7d76d2008-05-30 18:10:19 +00008765static PyObject *
8766unicode__format__(PyObject* self, PyObject* args)
8767{
8768 PyObject *format_spec;
8769
8770 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8771 return NULL;
8772
8773 return _PyUnicode_FormatAdvanced(self,
8774 PyUnicode_AS_UNICODE(format_spec),
8775 PyUnicode_GET_SIZE(format_spec));
8776}
8777
Eric Smith8c663262007-08-25 02:26:07 +00008778PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008780\n\
8781");
8782
8783static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008784unicode__sizeof__(PyUnicodeObject *v)
8785{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008786 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8787 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008788}
8789
8790PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008792
8793static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008794unicode_getnewargs(PyUnicodeObject *v)
8795{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008796 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008797}
8798
8799
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800static PyMethodDef unicode_methods[] = {
8801
8802 /* Order is according to common usage: often used methods should
8803 appear first, since lookup is done sequentially. */
8804
Benjamin Peterson308d6372009-09-18 21:42:35 +00008805 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008806 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8807 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008808 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008809 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8810 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8811 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8812 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8813 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8814 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8815 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008816 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008817 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8818 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8819 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008820 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008821 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8822 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8823 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008824 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008825 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008826 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008827 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008828 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8829 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8830 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8831 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8832 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8833 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8834 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8835 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8836 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8837 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8838 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8839 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8840 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8841 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008842 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008843 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008844 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008845 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008846 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008847 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8848 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008849 {"maketrans", (PyCFunction) unicode_maketrans,
8850 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008851 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008852#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008853 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854#endif
8855
8856#if 0
8857 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008858 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859#endif
8860
Benjamin Peterson14339b62009-01-31 16:36:08 +00008861 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 {NULL, NULL}
8863};
8864
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008865static PyObject *
8866unicode_mod(PyObject *v, PyObject *w)
8867{
Benjamin Peterson29060642009-01-31 22:14:21 +00008868 if (!PyUnicode_Check(v)) {
8869 Py_INCREF(Py_NotImplemented);
8870 return Py_NotImplemented;
8871 }
8872 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008873}
8874
8875static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008876 0, /*nb_add*/
8877 0, /*nb_subtract*/
8878 0, /*nb_multiply*/
8879 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008880};
8881
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008883 (lenfunc) unicode_length, /* sq_length */
8884 PyUnicode_Concat, /* sq_concat */
8885 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8886 (ssizeargfunc) unicode_getitem, /* sq_item */
8887 0, /* sq_slice */
8888 0, /* sq_ass_item */
8889 0, /* sq_ass_slice */
8890 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891};
8892
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008893static PyObject*
8894unicode_subscript(PyUnicodeObject* self, PyObject* item)
8895{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008896 if (PyIndex_Check(item)) {
8897 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008898 if (i == -1 && PyErr_Occurred())
8899 return NULL;
8900 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008901 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008902 return unicode_getitem(self, i);
8903 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008904 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008905 Py_UNICODE* source_buf;
8906 Py_UNICODE* result_buf;
8907 PyObject* result;
8908
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008909 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008911 return NULL;
8912 }
8913
8914 if (slicelength <= 0) {
8915 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008916 } else if (start == 0 && step == 1 && slicelength == self->length &&
8917 PyUnicode_CheckExact(self)) {
8918 Py_INCREF(self);
8919 return (PyObject *)self;
8920 } else if (step == 1) {
8921 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008922 } else {
8923 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008924 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8925 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008926
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 if (result_buf == NULL)
8928 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008929
8930 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8931 result_buf[i] = source_buf[cur];
8932 }
Tim Petersced69f82003-09-16 20:30:58 +00008933
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008934 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008935 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008936 return result;
8937 }
8938 } else {
8939 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8940 return NULL;
8941 }
8942}
8943
8944static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008945 (lenfunc)unicode_length, /* mp_length */
8946 (binaryfunc)unicode_subscript, /* mp_subscript */
8947 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008948};
8949
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951/* Helpers for PyUnicode_Format() */
8952
8953static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008954getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008956 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008958 (*p_argidx)++;
8959 if (arglen < 0)
8960 return args;
8961 else
8962 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 }
8964 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 return NULL;
8967}
8968
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008969/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008971static PyObject *
8972formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008974 char *p;
8975 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008977
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 x = PyFloat_AsDouble(v);
8979 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008980 return NULL;
8981
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008984
Eric Smith0923d1d2009-04-16 20:16:10 +00008985 p = PyOS_double_to_string(x, type, prec,
8986 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008987 if (p == NULL)
8988 return NULL;
8989 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008990 PyMem_Free(p);
8991 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992}
8993
Tim Peters38fd5b62000-09-21 05:43:11 +00008994static PyObject*
8995formatlong(PyObject *val, int flags, int prec, int type)
8996{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008997 char *buf;
8998 int len;
8999 PyObject *str; /* temporary string object. */
9000 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009001
Benjamin Peterson14339b62009-01-31 16:36:08 +00009002 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9003 if (!str)
9004 return NULL;
9005 result = PyUnicode_FromStringAndSize(buf, len);
9006 Py_DECREF(str);
9007 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009008}
9009
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010static int
9011formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009012 size_t buflen,
9013 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009015 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009016 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 if (PyUnicode_GET_SIZE(v) == 1) {
9018 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9019 buf[1] = '\0';
9020 return 1;
9021 }
9022#ifndef Py_UNICODE_WIDE
9023 if (PyUnicode_GET_SIZE(v) == 2) {
9024 /* Decode a valid surrogate pair */
9025 int c0 = PyUnicode_AS_UNICODE(v)[0];
9026 int c1 = PyUnicode_AS_UNICODE(v)[1];
9027 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9028 0xDC00 <= c1 && c1 <= 0xDFFF) {
9029 buf[0] = c0;
9030 buf[1] = c1;
9031 buf[2] = '\0';
9032 return 2;
9033 }
9034 }
9035#endif
9036 goto onError;
9037 }
9038 else {
9039 /* Integer input truncated to a character */
9040 long x;
9041 x = PyLong_AsLong(v);
9042 if (x == -1 && PyErr_Occurred())
9043 goto onError;
9044
9045 if (x < 0 || x > 0x10ffff) {
9046 PyErr_SetString(PyExc_OverflowError,
9047 "%c arg not in range(0x110000)");
9048 return -1;
9049 }
9050
9051#ifndef Py_UNICODE_WIDE
9052 if (x > 0xffff) {
9053 x -= 0x10000;
9054 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9055 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9056 return 2;
9057 }
9058#endif
9059 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009060 buf[1] = '\0';
9061 return 1;
9062 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009063
Benjamin Peterson29060642009-01-31 22:14:21 +00009064 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009065 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009066 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009067 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068}
9069
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009070/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009071 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009072*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009073#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009074
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077{
9078 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009079 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080 int args_owned = 0;
9081 PyUnicodeObject *result = NULL;
9082 PyObject *dict = NULL;
9083 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009084
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 PyErr_BadInternalCall();
9087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 }
9089 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009090 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092 fmt = PyUnicode_AS_UNICODE(uformat);
9093 fmtcnt = PyUnicode_GET_SIZE(uformat);
9094
9095 reslen = rescnt = fmtcnt + 100;
9096 result = _PyUnicode_New(reslen);
9097 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009098 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099 res = PyUnicode_AS_UNICODE(result);
9100
9101 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 arglen = PyTuple_Size(args);
9103 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 }
9105 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 arglen = -1;
9107 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009109 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009110 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112
9113 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 if (*fmt != '%') {
9115 if (--rescnt < 0) {
9116 rescnt = fmtcnt + 100;
9117 reslen += rescnt;
9118 if (_PyUnicode_Resize(&result, reslen) < 0)
9119 goto onError;
9120 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9121 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009122 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009124 }
9125 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 /* Got a format specifier */
9127 int flags = 0;
9128 Py_ssize_t width = -1;
9129 int prec = -1;
9130 Py_UNICODE c = '\0';
9131 Py_UNICODE fill;
9132 int isnumok;
9133 PyObject *v = NULL;
9134 PyObject *temp = NULL;
9135 Py_UNICODE *pbuf;
9136 Py_UNICODE sign;
9137 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009138 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 fmt++;
9141 if (*fmt == '(') {
9142 Py_UNICODE *keystart;
9143 Py_ssize_t keylen;
9144 PyObject *key;
9145 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009146
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 if (dict == NULL) {
9148 PyErr_SetString(PyExc_TypeError,
9149 "format requires a mapping");
9150 goto onError;
9151 }
9152 ++fmt;
9153 --fmtcnt;
9154 keystart = fmt;
9155 /* Skip over balanced parentheses */
9156 while (pcount > 0 && --fmtcnt >= 0) {
9157 if (*fmt == ')')
9158 --pcount;
9159 else if (*fmt == '(')
9160 ++pcount;
9161 fmt++;
9162 }
9163 keylen = fmt - keystart - 1;
9164 if (fmtcnt < 0 || pcount > 0) {
9165 PyErr_SetString(PyExc_ValueError,
9166 "incomplete format key");
9167 goto onError;
9168 }
9169#if 0
9170 /* keys are converted to strings using UTF-8 and
9171 then looked up since Python uses strings to hold
9172 variables names etc. in its namespaces and we
9173 wouldn't want to break common idioms. */
9174 key = PyUnicode_EncodeUTF8(keystart,
9175 keylen,
9176 NULL);
9177#else
9178 key = PyUnicode_FromUnicode(keystart, keylen);
9179#endif
9180 if (key == NULL)
9181 goto onError;
9182 if (args_owned) {
9183 Py_DECREF(args);
9184 args_owned = 0;
9185 }
9186 args = PyObject_GetItem(dict, key);
9187 Py_DECREF(key);
9188 if (args == NULL) {
9189 goto onError;
9190 }
9191 args_owned = 1;
9192 arglen = -1;
9193 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 while (--fmtcnt >= 0) {
9196 switch (c = *fmt++) {
9197 case '-': flags |= F_LJUST; continue;
9198 case '+': flags |= F_SIGN; continue;
9199 case ' ': flags |= F_BLANK; continue;
9200 case '#': flags |= F_ALT; continue;
9201 case '0': flags |= F_ZERO; continue;
9202 }
9203 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009204 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 if (c == '*') {
9206 v = getnextarg(args, arglen, &argidx);
9207 if (v == NULL)
9208 goto onError;
9209 if (!PyLong_Check(v)) {
9210 PyErr_SetString(PyExc_TypeError,
9211 "* wants int");
9212 goto onError;
9213 }
9214 width = PyLong_AsLong(v);
9215 if (width == -1 && PyErr_Occurred())
9216 goto onError;
9217 if (width < 0) {
9218 flags |= F_LJUST;
9219 width = -width;
9220 }
9221 if (--fmtcnt >= 0)
9222 c = *fmt++;
9223 }
9224 else if (c >= '0' && c <= '9') {
9225 width = c - '0';
9226 while (--fmtcnt >= 0) {
9227 c = *fmt++;
9228 if (c < '0' || c > '9')
9229 break;
9230 if ((width*10) / 10 != width) {
9231 PyErr_SetString(PyExc_ValueError,
9232 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009233 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009234 }
9235 width = width*10 + (c - '0');
9236 }
9237 }
9238 if (c == '.') {
9239 prec = 0;
9240 if (--fmtcnt >= 0)
9241 c = *fmt++;
9242 if (c == '*') {
9243 v = getnextarg(args, arglen, &argidx);
9244 if (v == NULL)
9245 goto onError;
9246 if (!PyLong_Check(v)) {
9247 PyErr_SetString(PyExc_TypeError,
9248 "* wants int");
9249 goto onError;
9250 }
9251 prec = PyLong_AsLong(v);
9252 if (prec == -1 && PyErr_Occurred())
9253 goto onError;
9254 if (prec < 0)
9255 prec = 0;
9256 if (--fmtcnt >= 0)
9257 c = *fmt++;
9258 }
9259 else if (c >= '0' && c <= '9') {
9260 prec = c - '0';
9261 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009262 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009263 if (c < '0' || c > '9')
9264 break;
9265 if ((prec*10) / 10 != prec) {
9266 PyErr_SetString(PyExc_ValueError,
9267 "prec too big");
9268 goto onError;
9269 }
9270 prec = prec*10 + (c - '0');
9271 }
9272 }
9273 } /* prec */
9274 if (fmtcnt >= 0) {
9275 if (c == 'h' || c == 'l' || c == 'L') {
9276 if (--fmtcnt >= 0)
9277 c = *fmt++;
9278 }
9279 }
9280 if (fmtcnt < 0) {
9281 PyErr_SetString(PyExc_ValueError,
9282 "incomplete format");
9283 goto onError;
9284 }
9285 if (c != '%') {
9286 v = getnextarg(args, arglen, &argidx);
9287 if (v == NULL)
9288 goto onError;
9289 }
9290 sign = 0;
9291 fill = ' ';
9292 switch (c) {
9293
9294 case '%':
9295 pbuf = formatbuf;
9296 /* presume that buffer length is at least 1 */
9297 pbuf[0] = '%';
9298 len = 1;
9299 break;
9300
9301 case 's':
9302 case 'r':
9303 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009304 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 temp = v;
9306 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009307 }
9308 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009309 if (c == 's')
9310 temp = PyObject_Str(v);
9311 else if (c == 'r')
9312 temp = PyObject_Repr(v);
9313 else
9314 temp = PyObject_ASCII(v);
9315 if (temp == NULL)
9316 goto onError;
9317 if (PyUnicode_Check(temp))
9318 /* nothing to do */;
9319 else {
9320 Py_DECREF(temp);
9321 PyErr_SetString(PyExc_TypeError,
9322 "%s argument has non-string str()");
9323 goto onError;
9324 }
9325 }
9326 pbuf = PyUnicode_AS_UNICODE(temp);
9327 len = PyUnicode_GET_SIZE(temp);
9328 if (prec >= 0 && len > prec)
9329 len = prec;
9330 break;
9331
9332 case 'i':
9333 case 'd':
9334 case 'u':
9335 case 'o':
9336 case 'x':
9337 case 'X':
9338 if (c == 'i')
9339 c = 'd';
9340 isnumok = 0;
9341 if (PyNumber_Check(v)) {
9342 PyObject *iobj=NULL;
9343
9344 if (PyLong_Check(v)) {
9345 iobj = v;
9346 Py_INCREF(iobj);
9347 }
9348 else {
9349 iobj = PyNumber_Long(v);
9350 }
9351 if (iobj!=NULL) {
9352 if (PyLong_Check(iobj)) {
9353 isnumok = 1;
9354 temp = formatlong(iobj, flags, prec, c);
9355 Py_DECREF(iobj);
9356 if (!temp)
9357 goto onError;
9358 pbuf = PyUnicode_AS_UNICODE(temp);
9359 len = PyUnicode_GET_SIZE(temp);
9360 sign = 1;
9361 }
9362 else {
9363 Py_DECREF(iobj);
9364 }
9365 }
9366 }
9367 if (!isnumok) {
9368 PyErr_Format(PyExc_TypeError,
9369 "%%%c format: a number is required, "
9370 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9371 goto onError;
9372 }
9373 if (flags & F_ZERO)
9374 fill = '0';
9375 break;
9376
9377 case 'e':
9378 case 'E':
9379 case 'f':
9380 case 'F':
9381 case 'g':
9382 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009383 temp = formatfloat(v, flags, prec, c);
9384 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009386 pbuf = PyUnicode_AS_UNICODE(temp);
9387 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 sign = 1;
9389 if (flags & F_ZERO)
9390 fill = '0';
9391 break;
9392
9393 case 'c':
9394 pbuf = formatbuf;
9395 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9396 if (len < 0)
9397 goto onError;
9398 break;
9399
9400 default:
9401 PyErr_Format(PyExc_ValueError,
9402 "unsupported format character '%c' (0x%x) "
9403 "at index %zd",
9404 (31<=c && c<=126) ? (char)c : '?',
9405 (int)c,
9406 (Py_ssize_t)(fmt - 1 -
9407 PyUnicode_AS_UNICODE(uformat)));
9408 goto onError;
9409 }
9410 if (sign) {
9411 if (*pbuf == '-' || *pbuf == '+') {
9412 sign = *pbuf++;
9413 len--;
9414 }
9415 else if (flags & F_SIGN)
9416 sign = '+';
9417 else if (flags & F_BLANK)
9418 sign = ' ';
9419 else
9420 sign = 0;
9421 }
9422 if (width < len)
9423 width = len;
9424 if (rescnt - (sign != 0) < width) {
9425 reslen -= rescnt;
9426 rescnt = width + fmtcnt + 100;
9427 reslen += rescnt;
9428 if (reslen < 0) {
9429 Py_XDECREF(temp);
9430 PyErr_NoMemory();
9431 goto onError;
9432 }
9433 if (_PyUnicode_Resize(&result, reslen) < 0) {
9434 Py_XDECREF(temp);
9435 goto onError;
9436 }
9437 res = PyUnicode_AS_UNICODE(result)
9438 + reslen - rescnt;
9439 }
9440 if (sign) {
9441 if (fill != ' ')
9442 *res++ = sign;
9443 rescnt--;
9444 if (width > len)
9445 width--;
9446 }
9447 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9448 assert(pbuf[0] == '0');
9449 assert(pbuf[1] == c);
9450 if (fill != ' ') {
9451 *res++ = *pbuf++;
9452 *res++ = *pbuf++;
9453 }
9454 rescnt -= 2;
9455 width -= 2;
9456 if (width < 0)
9457 width = 0;
9458 len -= 2;
9459 }
9460 if (width > len && !(flags & F_LJUST)) {
9461 do {
9462 --rescnt;
9463 *res++ = fill;
9464 } while (--width > len);
9465 }
9466 if (fill == ' ') {
9467 if (sign)
9468 *res++ = sign;
9469 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9470 assert(pbuf[0] == '0');
9471 assert(pbuf[1] == c);
9472 *res++ = *pbuf++;
9473 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009474 }
9475 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009476 Py_UNICODE_COPY(res, pbuf, len);
9477 res += len;
9478 rescnt -= len;
9479 while (--width >= len) {
9480 --rescnt;
9481 *res++ = ' ';
9482 }
9483 if (dict && (argidx < arglen) && c != '%') {
9484 PyErr_SetString(PyExc_TypeError,
9485 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009486 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 goto onError;
9488 }
9489 Py_XDECREF(temp);
9490 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 } /* until end */
9492 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 PyErr_SetString(PyExc_TypeError,
9494 "not all arguments converted during string formatting");
9495 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 }
9497
Thomas Woutersa96affe2006-03-12 00:29:36 +00009498 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009499 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 }
9503 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 return (PyObject *)result;
9505
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 Py_XDECREF(result);
9508 Py_DECREF(uformat);
9509 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 }
9512 return NULL;
9513}
9514
Jeremy Hylton938ace62002-07-17 16:30:39 +00009515static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009516unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9517
Tim Peters6d6c1a32001-08-02 04:15:00 +00009518static PyObject *
9519unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9520{
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009522 static char *kwlist[] = {"object", "encoding", "errors", 0};
9523 char *encoding = NULL;
9524 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009525
Benjamin Peterson14339b62009-01-31 16:36:08 +00009526 if (type != &PyUnicode_Type)
9527 return unicode_subtype_new(type, args, kwds);
9528 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009530 return NULL;
9531 if (x == NULL)
9532 return (PyObject *)_PyUnicode_New(0);
9533 if (encoding == NULL && errors == NULL)
9534 return PyObject_Str(x);
9535 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009536 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009537}
9538
Guido van Rossume023fe02001-08-30 03:12:59 +00009539static PyObject *
9540unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9541{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009542 PyUnicodeObject *tmp, *pnew;
9543 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009544
Benjamin Peterson14339b62009-01-31 16:36:08 +00009545 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9546 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9547 if (tmp == NULL)
9548 return NULL;
9549 assert(PyUnicode_Check(tmp));
9550 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9551 if (pnew == NULL) {
9552 Py_DECREF(tmp);
9553 return NULL;
9554 }
9555 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9556 if (pnew->str == NULL) {
9557 _Py_ForgetReference((PyObject *)pnew);
9558 PyObject_Del(pnew);
9559 Py_DECREF(tmp);
9560 return PyErr_NoMemory();
9561 }
9562 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9563 pnew->length = n;
9564 pnew->hash = tmp->hash;
9565 Py_DECREF(tmp);
9566 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009567}
9568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009569PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009571\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009572Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009573encoding defaults to the current default string encoding.\n\
9574errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009575
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009576static PyObject *unicode_iter(PyObject *seq);
9577
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009579 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009580 "str", /* tp_name */
9581 sizeof(PyUnicodeObject), /* tp_size */
9582 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009584 (destructor)unicode_dealloc, /* tp_dealloc */
9585 0, /* tp_print */
9586 0, /* tp_getattr */
9587 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009588 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009589 unicode_repr, /* tp_repr */
9590 &unicode_as_number, /* tp_as_number */
9591 &unicode_as_sequence, /* tp_as_sequence */
9592 &unicode_as_mapping, /* tp_as_mapping */
9593 (hashfunc) unicode_hash, /* tp_hash*/
9594 0, /* tp_call*/
9595 (reprfunc) unicode_str, /* tp_str */
9596 PyObject_GenericGetAttr, /* tp_getattro */
9597 0, /* tp_setattro */
9598 0, /* tp_as_buffer */
9599 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009600 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009601 unicode_doc, /* tp_doc */
9602 0, /* tp_traverse */
9603 0, /* tp_clear */
9604 PyUnicode_RichCompare, /* tp_richcompare */
9605 0, /* tp_weaklistoffset */
9606 unicode_iter, /* tp_iter */
9607 0, /* tp_iternext */
9608 unicode_methods, /* tp_methods */
9609 0, /* tp_members */
9610 0, /* tp_getset */
9611 &PyBaseObject_Type, /* tp_base */
9612 0, /* tp_dict */
9613 0, /* tp_descr_get */
9614 0, /* tp_descr_set */
9615 0, /* tp_dictoffset */
9616 0, /* tp_init */
9617 0, /* tp_alloc */
9618 unicode_new, /* tp_new */
9619 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620};
9621
9622/* Initialize the Unicode implementation */
9623
Thomas Wouters78890102000-07-22 19:25:51 +00009624void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009626 int i;
9627
Thomas Wouters477c8d52006-05-27 19:21:47 +00009628 /* XXX - move this array to unicodectype.c ? */
9629 Py_UNICODE linebreak[] = {
9630 0x000A, /* LINE FEED */
9631 0x000D, /* CARRIAGE RETURN */
9632 0x001C, /* FILE SEPARATOR */
9633 0x001D, /* GROUP SEPARATOR */
9634 0x001E, /* RECORD SEPARATOR */
9635 0x0085, /* NEXT LINE */
9636 0x2028, /* LINE SEPARATOR */
9637 0x2029, /* PARAGRAPH SEPARATOR */
9638 };
9639
Fred Drakee4315f52000-05-09 19:53:39 +00009640 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009641 free_list = NULL;
9642 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009644 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009645 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009646
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009647 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009648 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009649 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009651
9652 /* initialize the linebreak bloom filter */
9653 bloom_linebreak = make_bloom_mask(
9654 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9655 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009656
9657 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658}
9659
9660/* Finalize the Unicode implementation */
9661
Christian Heimesa156e092008-02-16 07:38:31 +00009662int
9663PyUnicode_ClearFreeList(void)
9664{
9665 int freelist_size = numfree;
9666 PyUnicodeObject *u;
9667
9668 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009669 PyUnicodeObject *v = u;
9670 u = *(PyUnicodeObject **)u;
9671 if (v->str)
9672 PyObject_DEL(v->str);
9673 Py_XDECREF(v->defenc);
9674 PyObject_Del(v);
9675 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009676 }
9677 free_list = NULL;
9678 assert(numfree == 0);
9679 return freelist_size;
9680}
9681
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682void
Thomas Wouters78890102000-07-22 19:25:51 +00009683_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009685 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009687 Py_XDECREF(unicode_empty);
9688 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009689
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009690 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009691 if (unicode_latin1[i]) {
9692 Py_DECREF(unicode_latin1[i]);
9693 unicode_latin1[i] = NULL;
9694 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009695 }
Christian Heimesa156e092008-02-16 07:38:31 +00009696 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009698
Walter Dörwald16807132007-05-25 13:52:07 +00009699void
9700PyUnicode_InternInPlace(PyObject **p)
9701{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009702 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9703 PyObject *t;
9704 if (s == NULL || !PyUnicode_Check(s))
9705 Py_FatalError(
9706 "PyUnicode_InternInPlace: unicode strings only please!");
9707 /* If it's a subclass, we don't really know what putting
9708 it in the interned dict might do. */
9709 if (!PyUnicode_CheckExact(s))
9710 return;
9711 if (PyUnicode_CHECK_INTERNED(s))
9712 return;
9713 if (interned == NULL) {
9714 interned = PyDict_New();
9715 if (interned == NULL) {
9716 PyErr_Clear(); /* Don't leave an exception */
9717 return;
9718 }
9719 }
9720 /* It might be that the GetItem call fails even
9721 though the key is present in the dictionary,
9722 namely when this happens during a stack overflow. */
9723 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009724 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009725 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009726
Benjamin Peterson29060642009-01-31 22:14:21 +00009727 if (t) {
9728 Py_INCREF(t);
9729 Py_DECREF(*p);
9730 *p = t;
9731 return;
9732 }
Walter Dörwald16807132007-05-25 13:52:07 +00009733
Benjamin Peterson14339b62009-01-31 16:36:08 +00009734 PyThreadState_GET()->recursion_critical = 1;
9735 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9736 PyErr_Clear();
9737 PyThreadState_GET()->recursion_critical = 0;
9738 return;
9739 }
9740 PyThreadState_GET()->recursion_critical = 0;
9741 /* The two references in interned are not counted by refcnt.
9742 The deallocator will take care of this */
9743 Py_REFCNT(s) -= 2;
9744 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009745}
9746
9747void
9748PyUnicode_InternImmortal(PyObject **p)
9749{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009750 PyUnicode_InternInPlace(p);
9751 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9752 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9753 Py_INCREF(*p);
9754 }
Walter Dörwald16807132007-05-25 13:52:07 +00009755}
9756
9757PyObject *
9758PyUnicode_InternFromString(const char *cp)
9759{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009760 PyObject *s = PyUnicode_FromString(cp);
9761 if (s == NULL)
9762 return NULL;
9763 PyUnicode_InternInPlace(&s);
9764 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009765}
9766
9767void _Py_ReleaseInternedUnicodeStrings(void)
9768{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009769 PyObject *keys;
9770 PyUnicodeObject *s;
9771 Py_ssize_t i, n;
9772 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009773
Benjamin Peterson14339b62009-01-31 16:36:08 +00009774 if (interned == NULL || !PyDict_Check(interned))
9775 return;
9776 keys = PyDict_Keys(interned);
9777 if (keys == NULL || !PyList_Check(keys)) {
9778 PyErr_Clear();
9779 return;
9780 }
Walter Dörwald16807132007-05-25 13:52:07 +00009781
Benjamin Peterson14339b62009-01-31 16:36:08 +00009782 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9783 detector, interned unicode strings are not forcibly deallocated;
9784 rather, we give them their stolen references back, and then clear
9785 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009786
Benjamin Peterson14339b62009-01-31 16:36:08 +00009787 n = PyList_GET_SIZE(keys);
9788 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009789 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009790 for (i = 0; i < n; i++) {
9791 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9792 switch (s->state) {
9793 case SSTATE_NOT_INTERNED:
9794 /* XXX Shouldn't happen */
9795 break;
9796 case SSTATE_INTERNED_IMMORTAL:
9797 Py_REFCNT(s) += 1;
9798 immortal_size += s->length;
9799 break;
9800 case SSTATE_INTERNED_MORTAL:
9801 Py_REFCNT(s) += 2;
9802 mortal_size += s->length;
9803 break;
9804 default:
9805 Py_FatalError("Inconsistent interned string state.");
9806 }
9807 s->state = SSTATE_NOT_INTERNED;
9808 }
9809 fprintf(stderr, "total size of all interned strings: "
9810 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9811 "mortal/immortal\n", mortal_size, immortal_size);
9812 Py_DECREF(keys);
9813 PyDict_Clear(interned);
9814 Py_DECREF(interned);
9815 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009816}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009817
9818
9819/********************* Unicode Iterator **************************/
9820
9821typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009822 PyObject_HEAD
9823 Py_ssize_t it_index;
9824 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009825} unicodeiterobject;
9826
9827static void
9828unicodeiter_dealloc(unicodeiterobject *it)
9829{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009830 _PyObject_GC_UNTRACK(it);
9831 Py_XDECREF(it->it_seq);
9832 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009833}
9834
9835static int
9836unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9837{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009838 Py_VISIT(it->it_seq);
9839 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009840}
9841
9842static PyObject *
9843unicodeiter_next(unicodeiterobject *it)
9844{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009845 PyUnicodeObject *seq;
9846 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009847
Benjamin Peterson14339b62009-01-31 16:36:08 +00009848 assert(it != NULL);
9849 seq = it->it_seq;
9850 if (seq == NULL)
9851 return NULL;
9852 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009853
Benjamin Peterson14339b62009-01-31 16:36:08 +00009854 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9855 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009856 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009857 if (item != NULL)
9858 ++it->it_index;
9859 return item;
9860 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009861
Benjamin Peterson14339b62009-01-31 16:36:08 +00009862 Py_DECREF(seq);
9863 it->it_seq = NULL;
9864 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009865}
9866
9867static PyObject *
9868unicodeiter_len(unicodeiterobject *it)
9869{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009870 Py_ssize_t len = 0;
9871 if (it->it_seq)
9872 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9873 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009874}
9875
9876PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9877
9878static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009879 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009880 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009881 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009882};
9883
9884PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009885 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9886 "str_iterator", /* tp_name */
9887 sizeof(unicodeiterobject), /* tp_basicsize */
9888 0, /* tp_itemsize */
9889 /* methods */
9890 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9891 0, /* tp_print */
9892 0, /* tp_getattr */
9893 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009894 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009895 0, /* tp_repr */
9896 0, /* tp_as_number */
9897 0, /* tp_as_sequence */
9898 0, /* tp_as_mapping */
9899 0, /* tp_hash */
9900 0, /* tp_call */
9901 0, /* tp_str */
9902 PyObject_GenericGetAttr, /* tp_getattro */
9903 0, /* tp_setattro */
9904 0, /* tp_as_buffer */
9905 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9906 0, /* tp_doc */
9907 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9908 0, /* tp_clear */
9909 0, /* tp_richcompare */
9910 0, /* tp_weaklistoffset */
9911 PyObject_SelfIter, /* tp_iter */
9912 (iternextfunc)unicodeiter_next, /* tp_iternext */
9913 unicodeiter_methods, /* tp_methods */
9914 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009915};
9916
9917static PyObject *
9918unicode_iter(PyObject *seq)
9919{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009920 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009921
Benjamin Peterson14339b62009-01-31 16:36:08 +00009922 if (!PyUnicode_Check(seq)) {
9923 PyErr_BadInternalCall();
9924 return NULL;
9925 }
9926 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9927 if (it == NULL)
9928 return NULL;
9929 it->it_index = 0;
9930 Py_INCREF(seq);
9931 it->it_seq = (PyUnicodeObject *)seq;
9932 _PyObject_GC_TRACK(it);
9933 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009934}
9935
/* Return the number of Py_UNICODE elements that precede the first zero
   element of u (the Py_UNICODE counterpart of strlen()).

   The length is derived from a pointer difference and returned as
   size_t; the previous version counted in an int, which could overflow
   for strings longer than INT_MAX elements. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end)
        end++;
    return (size_t)(end - u);
}
9944
/* Copy the zero-terminated string s2, terminator included, into s1 and
   return s1 (the Py_UNICODE counterpart of strcpy()). */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        *dst = *s2++;
        if (*dst++ == 0)
            break;
    }
    return s1;
}
9952
/* Copy at most n elements of the zero-terminated string s2 into s1 and
   return s1.

   Copying stops after the terminating zero has been copied or after n
   elements have been written, whichever comes first.  As with strncpy(),
   the result is not zero-terminated when s2 holds n or more characters.
   Unlike strncpy(), the rest of s1 is not padded with zeros.

   The previous version tested the counter only after each store and so
   wrote up to n + 1 elements (one element even for n == 0), overrunning
   a destination buffer of n elements. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
9962
/* Compare two zero-terminated Py_UNICODE strings element by element.

   Returns 0 if they are equal, -1 if s1 sorts before s2 and 1 if it sorts
   after.  A string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0)
        return (*s2 == 0) ? 0 : -1;     /* equal, or s1 is a prefix of s2 */
    if (*s2 == 0)
        return 1;                       /* s2 is a prefix of s1 */
    return (*s1 < *s2) ? -1 : +1;       /* first differing element decides */
}
9976
/* Compare at most n elements of two zero-terminated Py_UNICODE strings.

   Returns -1 or 1 according to the first differing element, or 0 if the
   strings agree over the first n elements or up to a common terminating
   zero. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t left = n;

    while (left != 0) {
        const Py_UNICODE a = *s1++;
        const Py_UNICODE b = *s2++;

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;              /* both strings ended together */
        left--;
    }
    return 0;
}
9993
/* Return a pointer to the first occurrence of c in the zero-terminated
   string s, or NULL if there is none.  The terminating zero itself is
   never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE *)s;
        s++;
    }
    return NULL;
}
10003
/* Return a pointer to the last occurrence of c in the zero-terminated
   string s, or NULL if there is none.  The terminating zero itself is
   never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor = s;

    /* Advance to the terminating zero ... */
    while (*cursor != 0)
        cursor++;

    /* ... then walk back towards the start of the string. */
    while (cursor != s) {
        --cursor;
        if (*cursor == c)
            return (Py_UNICODE *)cursor;
    }
    return NULL;
}
10016
Martin v. Löwis5b222132007-06-10 09:51:05 +000010017
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010018#ifdef __cplusplus
10019}
10020#endif