blob: de92787cc69e8b34dedb40aeaa652c13e73b1755 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Limit for the Unicode object free list: unicode_dealloc() only adds an
   object to the free list while numfree is below this value. */

#define PyUnicode_MAXFREELIST 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN gets defined,
   depending on whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* Free list for Unicode objects.  free_list is the head of a singly
   linked list of recycled objects (the link to the next entry is stored
   in the first pointer-sized slot of each freed object); numfree is the
   number of objects currently on the list. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are created lazily; a NULL slot means the
   corresponding character has not been requested yet. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is fixed to "utf-8".  Always use the
   PyUnicode_GetDefaultEncoding() API to access this global.

   Don't forget to alter Py_FileSystemDefaultEncoding if you change the
   hard coded default!
*/
static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by ASCII code point (0..127); an entry is 1 when
   the code point is treated as whitespace and 0 otherwise.  Each row
   below covers 16 consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Forward declarations of the encoder error helpers.  Their definitions
   are not part of this section of the file. */

/* NOTE(review): judging by the parameter names, this invokes the handler
   selected by `errors` for the slice [startpos, endpos) of `unicode` and
   reports the resume position through *newpos -- confirm against the
   definition. */
static PyObject *unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

/* NOTE(review): presumably raises an encode error for the slice
   [startpos, endpos) of `unicode`, using *exceptionObject as storage for
   the exception instance -- confirm against the definition. */
static void raise_encode_exception(PyObject **exceptionObject,
                                   const char *encoding,
                                   const Py_UNICODE *unicode, Py_ssize_t size,
                                   Py_ssize_t startpos, Py_ssize_t endpos,
                                   const char *reason);
167
/* Fast detection of ASCII line break characters.

   Lookup table indexed by ASCII code point (0..127); an entry is 1 when
   the code point is a line break and 0 otherwise.  Each row below covers
   16 consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
232#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Nonzero when chr is a member of set[0..setlen): the bloom mask rules out
   most non-members cheaply before unicode_member() does the exact scan.
   The whole expansion is parenthesized so the macro can be combined with
   other operators (e.g. negated) without changing its meaning.
   Note: chr is evaluated twice, so it must be free of side effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
/* Append the NUL-terminated char string `string` at the output cursor `s`,
   one output unit per char, advancing `s` past what was written.  The macro
   relies on `s` and on a scratch variable `const char *copy` being declared
   in the calling scope.  It is wrapped in do { } while (0) so that
   "appendstring(x);" is a single statement, safe in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 "coercing to str: need string or buffer, "
1277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
/* Write a normalized copy of `encoding` into `lower` (capacity `lower_len`
   bytes, terminator included): upper-case letters become lower case and '_'
   becomes '-', so that e.g. UTF_8 is recognized.
   Return 1 on success, 0 when the encoding name does not fit (it is longer
   than lower_len-1 characters). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot; it is reserved for the terminating NUL. */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        if (dst == limit)
            return 0;
        if (ISUPPER(*src))
            *dst = TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1328
1329PyObject *PyUnicode_Decode(const char *s,
1330 Py_ssize_t size,
1331 const char *encoding,
1332 const char *errors)
1333{
1334 PyObject *buffer = NULL, *unicode;
1335 Py_buffer info;
1336 char lower[11]; /* Enough for any encoding shortcut */
1337
1338 if (encoding == NULL)
1339 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001340
1341 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001342 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1343 if (strcmp(lower, "utf-8") == 0)
1344 return PyUnicode_DecodeUTF8(s, size, errors);
1345 else if ((strcmp(lower, "latin-1") == 0) ||
1346 (strcmp(lower, "iso-8859-1") == 0))
1347 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001348#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001349 else if (strcmp(lower, "mbcs") == 0)
1350 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001351#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001352 else if (strcmp(lower, "ascii") == 0)
1353 return PyUnicode_DecodeASCII(s, size, errors);
1354 else if (strcmp(lower, "utf-16") == 0)
1355 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1356 else if (strcmp(lower, "utf-32") == 0)
1357 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359
1360 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001361 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001362 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001363 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001364 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 if (buffer == NULL)
1366 goto onError;
1367 unicode = PyCodec_Decode(buffer, encoding, errors);
1368 if (unicode == NULL)
1369 goto onError;
1370 if (!PyUnicode_Check(unicode)) {
1371 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001372 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001373 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001374 Py_DECREF(unicode);
1375 goto onError;
1376 }
1377 Py_DECREF(buffer);
1378 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001379
Benjamin Peterson29060642009-01-31 22:14:21 +00001380 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 Py_XDECREF(buffer);
1382 return NULL;
1383}
1384
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001385PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1386 const char *encoding,
1387 const char *errors)
1388{
1389 PyObject *v;
1390
1391 if (!PyUnicode_Check(unicode)) {
1392 PyErr_BadArgument();
1393 goto onError;
1394 }
1395
1396 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001398
1399 /* Decode via the codec registry */
1400 v = PyCodec_Decode(unicode, encoding, errors);
1401 if (v == NULL)
1402 goto onError;
1403 return v;
1404
Benjamin Peterson29060642009-01-31 22:14:21 +00001405 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001406 return NULL;
1407}
1408
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001409PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1410 const char *encoding,
1411 const char *errors)
1412{
1413 PyObject *v;
1414
1415 if (!PyUnicode_Check(unicode)) {
1416 PyErr_BadArgument();
1417 goto onError;
1418 }
1419
1420 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001422
1423 /* Decode via the codec registry */
1424 v = PyCodec_Decode(unicode, encoding, errors);
1425 if (v == NULL)
1426 goto onError;
1427 if (!PyUnicode_Check(v)) {
1428 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001429 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001430 Py_TYPE(v)->tp_name);
1431 Py_DECREF(v);
1432 goto onError;
1433 }
1434 return v;
1435
Benjamin Peterson29060642009-01-31 22:14:21 +00001436 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001437 return NULL;
1438}
1439
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001441 Py_ssize_t size,
1442 const char *encoding,
1443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444{
1445 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001446
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 unicode = PyUnicode_FromUnicode(s, size);
1448 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1451 Py_DECREF(unicode);
1452 return v;
1453}
1454
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001455PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1456 const char *encoding,
1457 const char *errors)
1458{
1459 PyObject *v;
1460
1461 if (!PyUnicode_Check(unicode)) {
1462 PyErr_BadArgument();
1463 goto onError;
1464 }
1465
1466 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001467 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001468
1469 /* Encode via the codec registry */
1470 v = PyCodec_Encode(unicode, encoding, errors);
1471 if (v == NULL)
1472 goto onError;
1473 return v;
1474
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001476 return NULL;
1477}
1478
Victor Stinnerae6265f2010-05-15 16:27:27 +00001479PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1480{
1481 if (Py_FileSystemDefaultEncoding)
1482 return PyUnicode_AsEncodedString(unicode,
1483 Py_FileSystemDefaultEncoding,
1484 "surrogateescape");
1485 else
1486 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1487 PyUnicode_GET_SIZE(unicode),
1488 "surrogateescape");
1489}
1490
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1492 const char *encoding,
1493 const char *errors)
1494{
1495 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001496 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001497
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 }
Fred Drakee4315f52000-05-09 19:53:39 +00001502
Tim Petersced69f82003-09-16 20:30:58 +00001503 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001505
1506 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001507 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1508 if (strcmp(lower, "utf-8") == 0)
1509 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1510 PyUnicode_GET_SIZE(unicode),
1511 errors);
1512 else if ((strcmp(lower, "latin-1") == 0) ||
1513 (strcmp(lower, "iso-8859-1") == 0))
1514 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1515 PyUnicode_GET_SIZE(unicode),
1516 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001517#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001518 else if (strcmp(lower, "mbcs") == 0)
1519 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1520 PyUnicode_GET_SIZE(unicode),
1521 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001522#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001523 else if (strcmp(lower, "ascii") == 0)
1524 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1525 PyUnicode_GET_SIZE(unicode),
1526 errors);
1527 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001528 /* During bootstrap, we may need to find the encodings
1529 package, to load the file system encoding, and require the
1530 file system encoding in order to load the encodings
1531 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001532
Victor Stinner59e62db2010-05-15 13:14:32 +00001533 Break out of this dependency by assuming that the path to
1534 the encodings module is ASCII-only. XXX could try wcstombs
1535 instead, if the file system encoding is the locale's
1536 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001537 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001538 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1539 !PyThreadState_GET()->interp->codecs_initialized)
1540 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1541 PyUnicode_GET_SIZE(unicode),
1542 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543
1544 /* Encode via the codec registry */
1545 v = PyCodec_Encode(unicode, encoding, errors);
1546 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001547 return NULL;
1548
1549 /* The normal path */
1550 if (PyBytes_Check(v))
1551 return v;
1552
1553 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001554 if (PyByteArray_Check(v)) {
1555 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001556 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001557 PyOS_snprintf(msg, sizeof(msg),
1558 "encoder %s returned buffer instead of bytes",
1559 encoding);
1560 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001561 Py_DECREF(v);
1562 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001563 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001564
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001565 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1566 Py_DECREF(v);
1567 return b;
1568 }
1569
1570 PyErr_Format(PyExc_TypeError,
1571 "encoder did not return a bytes object (type=%.400s)",
1572 Py_TYPE(v)->tp_name);
1573 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001574 return NULL;
1575}
1576
1577PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1578 const char *encoding,
1579 const char *errors)
1580{
1581 PyObject *v;
1582
1583 if (!PyUnicode_Check(unicode)) {
1584 PyErr_BadArgument();
1585 goto onError;
1586 }
1587
1588 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001590
1591 /* Encode via the codec registry */
1592 v = PyCodec_Encode(unicode, encoding, errors);
1593 if (v == NULL)
1594 goto onError;
1595 if (!PyUnicode_Check(v)) {
1596 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001597 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001598 Py_TYPE(v)->tp_name);
1599 Py_DECREF(v);
1600 goto onError;
1601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001603
Benjamin Peterson29060642009-01-31 22:14:21 +00001604 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 return NULL;
1606}
1607
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001608PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001609 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001610{
1611 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001612 if (v)
1613 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001614 if (errors != NULL)
1615 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001616 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001617 PyUnicode_GET_SIZE(unicode),
1618 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001619 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001620 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001621 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001622 return v;
1623}
1624
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001625PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001626PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001627 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001628 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1629}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001630
Christian Heimes5894ba72007-11-04 11:43:14 +00001631PyObject*
1632PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1633{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001634 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1635 can be undefined. If it is case, decode using UTF-8. The following assumes
1636 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1637 bootstrapping process where the codecs aren't ready yet.
1638 */
1639 if (Py_FileSystemDefaultEncoding) {
1640#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001641 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001642 return PyUnicode_DecodeMBCS(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001643 }
1644#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001645 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001646 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001647 }
1648#endif
1649 return PyUnicode_Decode(s, size,
1650 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001651 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001652 }
1653 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001654 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001655 }
1656}
1657
Martin v. Löwis011e8422009-05-05 04:43:17 +00001658/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001659 system encoding. The addr param must be a PyObject**.
1660 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001661
1662int
1663PyUnicode_FSConverter(PyObject* arg, void* addr)
1664{
1665 PyObject *output = NULL;
1666 Py_ssize_t size;
1667 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001668 if (arg == NULL) {
1669 Py_DECREF(*(PyObject**)addr);
1670 return 1;
1671 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001672 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001673 output = arg;
1674 Py_INCREF(output);
1675 }
1676 else {
1677 arg = PyUnicode_FromObject(arg);
1678 if (!arg)
1679 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001680 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001681 Py_DECREF(arg);
1682 if (!output)
1683 return 0;
1684 if (!PyBytes_Check(output)) {
1685 Py_DECREF(output);
1686 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1687 return 0;
1688 }
1689 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001690 size = PyBytes_GET_SIZE(output);
1691 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001692 if (size != strlen(data)) {
1693 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1694 Py_DECREF(output);
1695 return 0;
1696 }
1697 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001698 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001699}
1700
1701
Martin v. Löwis5b222132007-06-10 09:51:05 +00001702char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001703_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001704{
Christian Heimesf3863112007-11-22 07:46:41 +00001705 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001706 if (!PyUnicode_Check(unicode)) {
1707 PyErr_BadArgument();
1708 return NULL;
1709 }
Christian Heimesf3863112007-11-22 07:46:41 +00001710 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1711 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001712 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001713 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001714 *psize = PyBytes_GET_SIZE(bytes);
1715 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001716}
1717
1718char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001719_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001720{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001721 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001722}
1723
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1725{
1726 if (!PyUnicode_Check(unicode)) {
1727 PyErr_BadArgument();
1728 goto onError;
1729 }
1730 return PyUnicode_AS_UNICODE(unicode);
1731
Benjamin Peterson29060642009-01-31 22:14:21 +00001732 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 return NULL;
1734}
1735
Martin v. Löwis18e16552006-02-15 17:27:45 +00001736Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737{
1738 if (!PyUnicode_Check(unicode)) {
1739 PyErr_BadArgument();
1740 goto onError;
1741 }
1742 return PyUnicode_GET_SIZE(unicode);
1743
Benjamin Peterson29060642009-01-31 22:14:21 +00001744 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 return -1;
1746}
1747
Thomas Wouters78890102000-07-22 19:25:51 +00001748const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001749{
1750 return unicode_default_encoding;
1751}
1752
1753int PyUnicode_SetDefaultEncoding(const char *encoding)
1754{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001755 if (strcmp(encoding, unicode_default_encoding) != 0) {
1756 PyErr_Format(PyExc_ValueError,
1757 "Can only set default encoding to %s",
1758 unicode_default_encoding);
1759 return -1;
1760 }
Fred Drakee4315f52000-05-09 19:53:39 +00001761 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001762}
1763
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001764/* error handling callback helper:
1765 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001766 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001767 and adjust various state variables.
1768 return 0 on success, -1 on error
1769*/
1770
1771static
1772int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001773 const char *encoding, const char *reason,
1774 const char **input, const char **inend, Py_ssize_t *startinpos,
1775 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1776 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001777{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001778 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779
1780 PyObject *restuple = NULL;
1781 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001782 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001783 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001784 Py_ssize_t requiredsize;
1785 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001787 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001789 int res = -1;
1790
1791 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001792 *errorHandler = PyCodec_LookupError(errors);
1793 if (*errorHandler == NULL)
1794 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 }
1796
1797 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001798 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001799 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1800 if (*exceptionObject == NULL)
1801 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001802 }
1803 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001804 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1805 goto onError;
1806 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1807 goto onError;
1808 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1809 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 }
1811
1812 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1813 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001814 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001816 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001817 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 }
1819 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001820 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001821
1822 /* Copy back the bytes variables, which might have been modified by the
1823 callback */
1824 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1825 if (!inputobj)
1826 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001827 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001828 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001829 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001830 *input = PyBytes_AS_STRING(inputobj);
1831 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001832 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001833 /* we can DECREF safely, as the exception has another reference,
1834 so the object won't go away. */
1835 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001836
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001838 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001839 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001840 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1841 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001842 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843
1844 /* need more space? (at least enough for what we
1845 have+the replacement+the rest of the string (starting
1846 at the new input position), so we won't have to check space
1847 when there are no errors in the rest of the string) */
1848 repptr = PyUnicode_AS_UNICODE(repunicode);
1849 repsize = PyUnicode_GET_SIZE(repunicode);
1850 requiredsize = *outpos + repsize + insize-newpos;
1851 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001852 if (requiredsize<2*outsize)
1853 requiredsize = 2*outsize;
1854 if (_PyUnicode_Resize(output, requiredsize) < 0)
1855 goto onError;
1856 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001857 }
1858 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001859 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001860 Py_UNICODE_COPY(*outptr, repptr, repsize);
1861 *outptr += repsize;
1862 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001863
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 /* we made it! */
1865 res = 0;
1866
Benjamin Peterson29060642009-01-31 22:14:21 +00001867 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 Py_XDECREF(restuple);
1869 return res;
1870}
1871
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872/* --- UTF-7 Codec -------------------------------------------------------- */
1873
Antoine Pitrou244651a2009-05-04 18:56:13 +00001874/* See RFC2152 for details. We encode conservatively and decode liberally. */
1875
/* Three small helper macros for the base-64 alphabet used by UTF-7.
   Each one evaluates its argument more than once, so pass only
   side-effect-free expressions. */

/* IS_BASE64: non-zero if c is one of A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* FROM_BASE64: the 6-bit value of c, assuming c is a base-64 character
   (A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, otherwise 63). */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? ((c) - 'A') :         \
     ((c) >= 'a' && (c) <= 'z') ? ((c) - 'a' + 26) :    \
     ((c) >= '0' && (c) <= '9') ? ((c) - '0' + 52) :    \
     ((c) == '+') ? 62 : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
1898
/* DECODE_DIRECT: non-zero if a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies,
 * except '+', which starts a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
1906
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. + backslash ~ and non-printing codes
 *     0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [ bsl   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character value; only 1..127 can ever be direct
 * directO  -- non-zero if Set O characters are emitted directly
 * directWS -- non-zero if whitespace characters are emitted directly
 *
 * Both flag arguments are parenthesized in the expansion so that
 * compound expressions (e.g. "a || b") keep their meaning.  c is
 * evaluated several times and must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001952
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001953PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001954 Py_ssize_t size,
1955 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001956{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001957 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1958}
1959
Antoine Pitrou244651a2009-05-04 18:56:13 +00001960/* The decoder. The only state we preserve is our read position,
1961 * i.e. how many characters we have consumed. So if we end in the
1962 * middle of a shift sequence we have to back off the read position
1963 * and the output to the beginning of the sequence, otherwise we lose
1964 * all the shift state (seen bits, number of bits seen, high
1965 * surrogate). */
1966
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001967PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001968 Py_ssize_t size,
1969 const char *errors,
1970 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001971{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001972 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001973 Py_ssize_t startinpos;
1974 Py_ssize_t endinpos;
1975 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001976 const char *e;
1977 PyUnicodeObject *unicode;
1978 Py_UNICODE *p;
1979 const char *errmsg = "";
1980 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001981 Py_UNICODE *shiftOutStart;
1982 unsigned int base64bits = 0;
1983 unsigned long base64buffer = 0;
1984 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 PyObject *errorHandler = NULL;
1986 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001987
1988 unicode = _PyUnicode_New(size);
1989 if (!unicode)
1990 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001991 if (size == 0) {
1992 if (consumed)
1993 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001994 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001995 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001996
1997 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001998 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001999 e = s + size;
2000
2001 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002002 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002003 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002004 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002005
Antoine Pitrou244651a2009-05-04 18:56:13 +00002006 if (inShift) { /* in a base-64 section */
2007 if (IS_BASE64(ch)) { /* consume a base-64 character */
2008 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2009 base64bits += 6;
2010 s++;
2011 if (base64bits >= 16) {
2012 /* we have enough bits for a UTF-16 value */
2013 Py_UNICODE outCh = (Py_UNICODE)
2014 (base64buffer >> (base64bits-16));
2015 base64bits -= 16;
2016 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2017 if (surrogate) {
2018 /* expecting a second surrogate */
2019 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2020#ifdef Py_UNICODE_WIDE
2021 *p++ = (((surrogate & 0x3FF)<<10)
2022 | (outCh & 0x3FF)) + 0x10000;
2023#else
2024 *p++ = surrogate;
2025 *p++ = outCh;
2026#endif
2027 surrogate = 0;
2028 }
2029 else {
2030 surrogate = 0;
2031 errmsg = "second surrogate missing";
2032 goto utf7Error;
2033 }
2034 }
2035 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2036 /* first surrogate */
2037 surrogate = outCh;
2038 }
2039 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2040 errmsg = "unexpected second surrogate";
2041 goto utf7Error;
2042 }
2043 else {
2044 *p++ = outCh;
2045 }
2046 }
2047 }
2048 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002049 inShift = 0;
2050 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002051 if (surrogate) {
2052 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002053 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002054 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002055 if (base64bits > 0) { /* left-over bits */
2056 if (base64bits >= 6) {
2057 /* We've seen at least one base-64 character */
2058 errmsg = "partial character in shift sequence";
2059 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002060 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002061 else {
2062 /* Some bits remain; they should be zero */
2063 if (base64buffer != 0) {
2064 errmsg = "non-zero padding bits in shift sequence";
2065 goto utf7Error;
2066 }
2067 }
2068 }
2069 if (ch != '-') {
2070 /* '-' is absorbed; other terminating
2071 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002072 *p++ = ch;
2073 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002074 }
2075 }
2076 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002077 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002078 s++; /* consume '+' */
2079 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002080 s++;
2081 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002082 }
2083 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002084 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002085 shiftOutStart = p;
2086 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002087 }
2088 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002089 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002090 *p++ = ch;
2091 s++;
2092 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002093 else {
2094 startinpos = s-starts;
2095 s++;
2096 errmsg = "unexpected special character";
2097 goto utf7Error;
2098 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002099 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002100utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002101 outpos = p-PyUnicode_AS_UNICODE(unicode);
2102 endinpos = s-starts;
2103 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002104 errors, &errorHandler,
2105 "utf7", errmsg,
2106 &starts, &e, &startinpos, &endinpos, &exc, &s,
2107 &unicode, &outpos, &p))
2108 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002109 }
2110
Antoine Pitrou244651a2009-05-04 18:56:13 +00002111 /* end of string */
2112
2113 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2114 /* if we're in an inconsistent state, that's an error */
2115 if (surrogate ||
2116 (base64bits >= 6) ||
2117 (base64bits > 0 && base64buffer != 0)) {
2118 outpos = p-PyUnicode_AS_UNICODE(unicode);
2119 endinpos = size;
2120 if (unicode_decode_call_errorhandler(
2121 errors, &errorHandler,
2122 "utf7", "unterminated shift sequence",
2123 &starts, &e, &startinpos, &endinpos, &exc, &s,
2124 &unicode, &outpos, &p))
2125 goto onError;
2126 if (s < e)
2127 goto restart;
2128 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002129 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002130
2131 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002132 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002133 if (inShift) {
2134 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002135 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002136 }
2137 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002138 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002139 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002140 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002141
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002142 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002143 goto onError;
2144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002145 Py_XDECREF(errorHandler);
2146 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002147 return (PyObject *)unicode;
2148
Benjamin Peterson29060642009-01-31 22:14:21 +00002149 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 Py_XDECREF(errorHandler);
2151 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002152 Py_DECREF(unicode);
2153 return NULL;
2154}
2155
2156
2157PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002158 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002159 int base64SetO,
2160 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002161 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002162{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002163 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002164 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002165 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002166 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002167 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002168 unsigned int base64bits = 0;
2169 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002170 char * out;
2171 char * start;
2172
2173 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002174 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002175
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002176 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002177 return PyErr_NoMemory();
2178
Antoine Pitrou244651a2009-05-04 18:56:13 +00002179 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002180 if (v == NULL)
2181 return NULL;
2182
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002183 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002184 for (;i < size; ++i) {
2185 Py_UNICODE ch = s[i];
2186
Antoine Pitrou244651a2009-05-04 18:56:13 +00002187 if (inShift) {
2188 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2189 /* shifting out */
2190 if (base64bits) { /* output remaining bits */
2191 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2192 base64buffer = 0;
2193 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002194 }
2195 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002196 /* Characters not in the BASE64 set implicitly unshift the sequence
2197 so no '-' is required, except if the character is itself a '-' */
2198 if (IS_BASE64(ch) || ch == '-') {
2199 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002200 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002201 *out++ = (char) ch;
2202 }
2203 else {
2204 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002205 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002207 else { /* not in a shift sequence */
2208 if (ch == '+') {
2209 *out++ = '+';
2210 *out++ = '-';
2211 }
2212 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2213 *out++ = (char) ch;
2214 }
2215 else {
2216 *out++ = '+';
2217 inShift = 1;
2218 goto encode_char;
2219 }
2220 }
2221 continue;
2222encode_char:
2223#ifdef Py_UNICODE_WIDE
2224 if (ch >= 0x10000) {
2225 /* code first surrogate */
2226 base64bits += 16;
2227 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2228 while (base64bits >= 6) {
2229 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2230 base64bits -= 6;
2231 }
2232 /* prepare second surrogate */
2233 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2234 }
2235#endif
2236 base64bits += 16;
2237 base64buffer = (base64buffer << 16) | ch;
2238 while (base64bits >= 6) {
2239 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2240 base64bits -= 6;
2241 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002242 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002243 if (base64bits)
2244 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2245 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002246 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002247 if (_PyBytes_Resize(&v, out - start) < 0)
2248 return NULL;
2249 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002250}
2251
Antoine Pitrou244651a2009-05-04 18:56:13 +00002252#undef IS_BASE64
2253#undef FROM_BASE64
2254#undef TO_BASE64
2255#undef DECODE_DIRECT
2256#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002257
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258/* --- UTF-8 Codec -------------------------------------------------------- */
2259
/* Map a UTF-8 encoded prefix (first) byte to the total length in bytes of
   the sequence it introduces.  Zero means illegal prefix.  See RFC 2279
   for details.  The table is read-only, hence const. */
static const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, never valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four bytes, 0xF8-0xFB: five, 0xFC-0xFD: six,
       0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2281
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 Py_ssize_t size,
2284 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285{
Walter Dörwald69652032004-09-07 20:24:22 +00002286 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2287}
2288
Antoine Pitrouab868312009-01-10 15:40:25 +00002289/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2290#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2291
2292/* Mask to quickly check whether a C 'long' contains a
2293 non-ASCII, UTF8-encoded char. */
2294#if (SIZEOF_LONG == 8)
2295# define ASCII_CHAR_MASK 0x8080808080808080L
2296#elif (SIZEOF_LONG == 4)
2297# define ASCII_CHAR_MASK 0x80808080L
2298#else
2299# error C 'long' size should be either 4 or 8!
2300#endif
2301
Walter Dörwald69652032004-09-07 20:24:22 +00002302PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002303 Py_ssize_t size,
2304 const char *errors,
2305 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002306{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002307 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002309 Py_ssize_t startinpos;
2310 Py_ssize_t endinpos;
2311 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002312 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 PyUnicodeObject *unicode;
2314 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002315 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002316 PyObject *errorHandler = NULL;
2317 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318
2319 /* Note: size will always be longer than the resulting Unicode
2320 character count */
2321 unicode = _PyUnicode_New(size);
2322 if (!unicode)
2323 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002324 if (size == 0) {
2325 if (consumed)
2326 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329
2330 /* Unpack UTF-8 encoded data */
2331 p = unicode->str;
2332 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002333 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334
2335 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002336 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337
2338 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002339 /* Fast path for runs of ASCII characters. Given that common UTF-8
2340 input will consist of an overwhelming majority of ASCII
2341 characters, we try to optimize for this case by checking
2342 as many characters as a C 'long' can contain.
2343 First, check if we can do an aligned read, as most CPUs have
2344 a penalty for unaligned reads.
2345 */
2346 if (!((size_t) s & LONG_PTR_MASK)) {
2347 /* Help register allocation */
2348 register const char *_s = s;
2349 register Py_UNICODE *_p = p;
2350 while (_s < aligned_end) {
2351 /* Read a whole long at a time (either 4 or 8 bytes),
2352 and do a fast unrolled copy if it only contains ASCII
2353 characters. */
2354 unsigned long data = *(unsigned long *) _s;
2355 if (data & ASCII_CHAR_MASK)
2356 break;
2357 _p[0] = (unsigned char) _s[0];
2358 _p[1] = (unsigned char) _s[1];
2359 _p[2] = (unsigned char) _s[2];
2360 _p[3] = (unsigned char) _s[3];
2361#if (SIZEOF_LONG == 8)
2362 _p[4] = (unsigned char) _s[4];
2363 _p[5] = (unsigned char) _s[5];
2364 _p[6] = (unsigned char) _s[6];
2365 _p[7] = (unsigned char) _s[7];
2366#endif
2367 _s += SIZEOF_LONG;
2368 _p += SIZEOF_LONG;
2369 }
2370 s = _s;
2371 p = _p;
2372 if (s == e)
2373 break;
2374 ch = (unsigned char)*s;
2375 }
2376 }
2377
2378 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002379 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002380 s++;
2381 continue;
2382 }
2383
2384 n = utf8_code_length[ch];
2385
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002386 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002387 if (consumed)
2388 break;
2389 else {
2390 errmsg = "unexpected end of data";
2391 startinpos = s-starts;
2392 endinpos = size;
2393 goto utf8Error;
2394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002395 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396
2397 switch (n) {
2398
2399 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002400 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002401 startinpos = s-starts;
2402 endinpos = startinpos+1;
2403 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404
2405 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002406 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002407 startinpos = s-starts;
2408 endinpos = startinpos+1;
2409 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410
2411 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002412 if ((s[1] & 0xc0) != 0x80) {
2413 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002414 startinpos = s-starts;
2415 endinpos = startinpos+2;
2416 goto utf8Error;
2417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002419 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002420 startinpos = s-starts;
2421 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002422 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002423 goto utf8Error;
2424 }
2425 else
2426 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427 break;
2428
2429 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002430 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002431 (s[2] & 0xc0) != 0x80) {
2432 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002433 startinpos = s-starts;
2434 endinpos = startinpos+3;
2435 goto utf8Error;
2436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002438 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002439 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002440 startinpos = s-starts;
2441 endinpos = startinpos+3;
2442 goto utf8Error;
2443 }
2444 else
2445 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002446 break;
2447
2448 case 4:
2449 if ((s[1] & 0xc0) != 0x80 ||
2450 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002451 (s[3] & 0xc0) != 0x80) {
2452 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002453 startinpos = s-starts;
2454 endinpos = startinpos+4;
2455 goto utf8Error;
2456 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002457 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002458 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002459 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002460 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002461 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002462 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002463 UTF-16 */
2464 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002465 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002466 startinpos = s-starts;
2467 endinpos = startinpos+4;
2468 goto utf8Error;
2469 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002470#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002471 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002472#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002473 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002474
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002475 /* translate from 10000..10FFFF to 0..FFFF */
2476 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002477
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002478 /* high surrogate = top 10 bits added to D800 */
2479 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002480
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002481 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002482 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002483#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 break;
2485
2486 default:
2487 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002488 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002489 startinpos = s-starts;
2490 endinpos = startinpos+n;
2491 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 }
2493 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002495
Benjamin Peterson29060642009-01-31 22:14:21 +00002496 utf8Error:
2497 outpos = p-PyUnicode_AS_UNICODE(unicode);
2498 if (unicode_decode_call_errorhandler(
2499 errors, &errorHandler,
2500 "utf8", errmsg,
2501 &starts, &e, &startinpos, &endinpos, &exc, &s,
2502 &unicode, &outpos, &p))
2503 goto onError;
2504 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 }
Walter Dörwald69652032004-09-07 20:24:22 +00002506 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002507 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508
2509 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002510 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 goto onError;
2512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002513 Py_XDECREF(errorHandler);
2514 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 return (PyObject *)unicode;
2516
Benjamin Peterson29060642009-01-31 22:14:21 +00002517 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002518 Py_XDECREF(errorHandler);
2519 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520 Py_DECREF(unicode);
2521 return NULL;
2522}
2523
Antoine Pitrouab868312009-01-10 15:40:25 +00002524#undef ASCII_CHAR_MASK
2525
2526
Tim Peters602f7402002-04-27 18:03:26 +00002527/* Allocation strategy: if the string is short, convert into a stack buffer
2528 and allocate exactly as much space needed at the end. Else allocate the
2529 maximum possible needed (4 result bytes per Unicode character), and return
2530 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002531*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002532PyObject *
2533PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002534 Py_ssize_t size,
2535 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536{
Tim Peters602f7402002-04-27 18:03:26 +00002537#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002538
Guido van Rossum98297ee2007-11-06 21:34:58 +00002539 Py_ssize_t i; /* index into s of next input byte */
2540 PyObject *result; /* result string object */
2541 char *p; /* next free byte in output buffer */
2542 Py_ssize_t nallocated; /* number of result bytes allocated */
2543 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002544 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002545 PyObject *errorHandler = NULL;
2546 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002547
Tim Peters602f7402002-04-27 18:03:26 +00002548 assert(s != NULL);
2549 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550
Tim Peters602f7402002-04-27 18:03:26 +00002551 if (size <= MAX_SHORT_UNICHARS) {
2552 /* Write into the stack buffer; nallocated can't overflow.
2553 * At the end, we'll allocate exactly as much heap space as it
2554 * turns out we need.
2555 */
2556 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002557 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002558 p = stackbuf;
2559 }
2560 else {
2561 /* Overallocate on the heap, and give the excess back at the end. */
2562 nallocated = size * 4;
2563 if (nallocated / 4 != size) /* overflow! */
2564 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002565 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002566 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002567 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002568 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002569 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002570
Tim Peters602f7402002-04-27 18:03:26 +00002571 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002572 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002573
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002574 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002575 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002577
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002579 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002580 *p++ = (char)(0xc0 | (ch >> 6));
2581 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002582 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002583#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002584 /* Special case: check for high and low surrogate */
2585 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2586 Py_UCS4 ch2 = s[i];
2587 /* Combine the two surrogates to form a UCS4 value */
2588 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2589 i++;
2590
2591 /* Encode UCS4 Unicode ordinals */
2592 *p++ = (char)(0xf0 | (ch >> 18));
2593 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002594 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2595 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002596 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002597#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002598 Py_ssize_t newpos;
2599 PyObject *rep;
2600 Py_ssize_t repsize, k;
2601 rep = unicode_encode_call_errorhandler
2602 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2603 s, size, &exc, i-1, i, &newpos);
2604 if (!rep)
2605 goto error;
2606
2607 if (PyBytes_Check(rep))
2608 repsize = PyBytes_GET_SIZE(rep);
2609 else
2610 repsize = PyUnicode_GET_SIZE(rep);
2611
2612 if (repsize > 4) {
2613 Py_ssize_t offset;
2614
2615 if (result == NULL)
2616 offset = p - stackbuf;
2617 else
2618 offset = p - PyBytes_AS_STRING(result);
2619
2620 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2621 /* integer overflow */
2622 PyErr_NoMemory();
2623 goto error;
2624 }
2625 nallocated += repsize - 4;
2626 if (result != NULL) {
2627 if (_PyBytes_Resize(&result, nallocated) < 0)
2628 goto error;
2629 } else {
2630 result = PyBytes_FromStringAndSize(NULL, nallocated);
2631 if (result == NULL)
2632 goto error;
2633 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2634 }
2635 p = PyBytes_AS_STRING(result) + offset;
2636 }
2637
2638 if (PyBytes_Check(rep)) {
2639 char *prep = PyBytes_AS_STRING(rep);
2640 for(k = repsize; k > 0; k--)
2641 *p++ = *prep++;
2642 } else /* rep is unicode */ {
2643 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2644 Py_UNICODE c;
2645
2646 for(k=0; k<repsize; k++) {
2647 c = prep[k];
2648 if (0x80 <= c) {
2649 raise_encode_exception(&exc, "utf-8", s, size,
2650 i-1, i, "surrogates not allowed");
2651 goto error;
2652 }
2653 *p++ = (char)prep[k];
2654 }
2655 }
2656 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002657#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002658 }
Victor Stinner445a6232010-04-22 20:01:57 +00002659#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002660 } else if (ch < 0x10000) {
2661 *p++ = (char)(0xe0 | (ch >> 12));
2662 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2663 *p++ = (char)(0x80 | (ch & 0x3f));
2664 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002665 /* Encode UCS4 Unicode ordinals */
2666 *p++ = (char)(0xf0 | (ch >> 18));
2667 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2668 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2669 *p++ = (char)(0x80 | (ch & 0x3f));
2670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002672
Guido van Rossum98297ee2007-11-06 21:34:58 +00002673 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002674 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002675 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002676 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002677 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002678 }
2679 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002680 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002681 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002682 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002683 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002684 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002685 Py_XDECREF(errorHandler);
2686 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002687 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002688 error:
2689 Py_XDECREF(errorHandler);
2690 Py_XDECREF(exc);
2691 Py_XDECREF(result);
2692 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002693
Tim Peters602f7402002-04-27 18:03:26 +00002694#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695}
2696
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2698{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 if (!PyUnicode_Check(unicode)) {
2700 PyErr_BadArgument();
2701 return NULL;
2702 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002703 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002704 PyUnicode_GET_SIZE(unicode),
2705 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706}
2707
Walter Dörwald41980ca2007-08-16 21:55:45 +00002708/* --- UTF-32 Codec ------------------------------------------------------- */
2709
2710PyObject *
2711PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002712 Py_ssize_t size,
2713 const char *errors,
2714 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002715{
2716 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2717}
2718
2719PyObject *
2720PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002721 Py_ssize_t size,
2722 const char *errors,
2723 int *byteorder,
2724 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002725{
2726 const char *starts = s;
2727 Py_ssize_t startinpos;
2728 Py_ssize_t endinpos;
2729 Py_ssize_t outpos;
2730 PyUnicodeObject *unicode;
2731 Py_UNICODE *p;
2732#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002733 int pairs = 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002734#else
2735 const int pairs = 0;
2736#endif
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002737 const unsigned char *q, *e, *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002738 int bo = 0; /* assume native ordering by default */
2739 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002740 /* Offsets from q for retrieving bytes in the right order. */
2741#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2742 int iorder[] = {0, 1, 2, 3};
2743#else
2744 int iorder[] = {3, 2, 1, 0};
2745#endif
2746 PyObject *errorHandler = NULL;
2747 PyObject *exc = NULL;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002748
Walter Dörwald41980ca2007-08-16 21:55:45 +00002749 q = (unsigned char *)s;
2750 e = q + size;
2751
2752 if (byteorder)
2753 bo = *byteorder;
2754
2755 /* Check for BOM marks (U+FEFF) in the input and adjust current
2756 byte order setting accordingly. In native mode, the leading BOM
2757 mark is skipped, in all other modes, it is copied to the output
2758 stream as-is (giving a ZWNBSP character). */
2759 if (bo == 0) {
2760 if (size >= 4) {
2761 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002763#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 if (bom == 0x0000FEFF) {
2765 q += 4;
2766 bo = -1;
2767 }
2768 else if (bom == 0xFFFE0000) {
2769 q += 4;
2770 bo = 1;
2771 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002772#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002773 if (bom == 0x0000FEFF) {
2774 q += 4;
2775 bo = 1;
2776 }
2777 else if (bom == 0xFFFE0000) {
2778 q += 4;
2779 bo = -1;
2780 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002781#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002782 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002783 }
2784
2785 if (bo == -1) {
2786 /* force LE */
2787 iorder[0] = 0;
2788 iorder[1] = 1;
2789 iorder[2] = 2;
2790 iorder[3] = 3;
2791 }
2792 else if (bo == 1) {
2793 /* force BE */
2794 iorder[0] = 3;
2795 iorder[1] = 2;
2796 iorder[2] = 1;
2797 iorder[3] = 0;
2798 }
2799
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002800 /* On narrow builds we split characters outside the BMP into two
2801 codepoints => count how much extra space we need. */
2802#ifndef Py_UNICODE_WIDE
2803 for (qq = q; qq < e; qq += 4)
2804 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2805 pairs++;
2806#endif
2807
2808 /* This might be one to much, because of a BOM */
2809 unicode = _PyUnicode_New((size+3)/4+pairs);
2810 if (!unicode)
2811 return NULL;
2812 if (size == 0)
2813 return (PyObject *)unicode;
2814
2815 /* Unpack UTF-32 encoded data */
2816 p = unicode->str;
2817
Walter Dörwald41980ca2007-08-16 21:55:45 +00002818 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 Py_UCS4 ch;
2820 /* remaining bytes at the end? (size should be divisible by 4) */
2821 if (e-q<4) {
2822 if (consumed)
2823 break;
2824 errmsg = "truncated data";
2825 startinpos = ((const char *)q)-starts;
2826 endinpos = ((const char *)e)-starts;
2827 goto utf32Error;
2828 /* The remaining input chars are ignored if the callback
2829 chooses to skip the input */
2830 }
2831 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2832 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002833
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 if (ch >= 0x110000)
2835 {
2836 errmsg = "codepoint not in range(0x110000)";
2837 startinpos = ((const char *)q)-starts;
2838 endinpos = startinpos+4;
2839 goto utf32Error;
2840 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002841#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002842 if (ch >= 0x10000)
2843 {
2844 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2845 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2846 }
2847 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002848#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002849 *p++ = ch;
2850 q += 4;
2851 continue;
2852 utf32Error:
2853 outpos = p-PyUnicode_AS_UNICODE(unicode);
2854 if (unicode_decode_call_errorhandler(
2855 errors, &errorHandler,
2856 "utf32", errmsg,
2857 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2858 &unicode, &outpos, &p))
2859 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002860 }
2861
2862 if (byteorder)
2863 *byteorder = bo;
2864
2865 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002866 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002867
2868 /* Adjust length */
2869 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2870 goto onError;
2871
2872 Py_XDECREF(errorHandler);
2873 Py_XDECREF(exc);
2874 return (PyObject *)unicode;
2875
Benjamin Peterson29060642009-01-31 22:14:21 +00002876 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002877 Py_DECREF(unicode);
2878 Py_XDECREF(errorHandler);
2879 Py_XDECREF(exc);
2880 return NULL;
2881}
2882
2883PyObject *
2884PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 Py_ssize_t size,
2886 const char *errors,
2887 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002888{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002889 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002890 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002891 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002892#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002893 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002894#else
2895 const int pairs = 0;
2896#endif
2897 /* Offsets from p for storing byte pairs in the right order. */
2898#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2899 int iorder[] = {0, 1, 2, 3};
2900#else
2901 int iorder[] = {3, 2, 1, 0};
2902#endif
2903
Benjamin Peterson29060642009-01-31 22:14:21 +00002904#define STORECHAR(CH) \
2905 do { \
2906 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2907 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2908 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2909 p[iorder[0]] = (CH) & 0xff; \
2910 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002911 } while(0)
2912
2913 /* In narrow builds we can output surrogate pairs as one codepoint,
2914 so we need less space. */
2915#ifndef Py_UNICODE_WIDE
2916 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002917 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2918 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2919 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002920#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002921 nsize = (size - pairs + (byteorder == 0));
2922 bytesize = nsize * 4;
2923 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002924 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002925 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002926 if (v == NULL)
2927 return NULL;
2928
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002929 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002930 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002932 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002933 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002934
2935 if (byteorder == -1) {
2936 /* force LE */
2937 iorder[0] = 0;
2938 iorder[1] = 1;
2939 iorder[2] = 2;
2940 iorder[3] = 3;
2941 }
2942 else if (byteorder == 1) {
2943 /* force BE */
2944 iorder[0] = 3;
2945 iorder[1] = 2;
2946 iorder[2] = 1;
2947 iorder[3] = 0;
2948 }
2949
2950 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002952#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002953 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2954 Py_UCS4 ch2 = *s;
2955 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2956 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2957 s++;
2958 size--;
2959 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002960 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002961#endif
2962 STORECHAR(ch);
2963 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002964
2965 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002966 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002967#undef STORECHAR
2968}
2969
2970PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2971{
2972 if (!PyUnicode_Check(unicode)) {
2973 PyErr_BadArgument();
2974 return NULL;
2975 }
2976 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002977 PyUnicode_GET_SIZE(unicode),
2978 NULL,
2979 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002980}
2981
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982/* --- UTF-16 Codec ------------------------------------------------------- */
2983
Tim Peters772747b2001-08-09 22:21:55 +00002984PyObject *
2985PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002986 Py_ssize_t size,
2987 const char *errors,
2988 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989{
Walter Dörwald69652032004-09-07 20:24:22 +00002990 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2991}
2992
Antoine Pitrouab868312009-01-10 15:40:25 +00002993/* Two masks for fast checking of whether a C 'long' may contain
2994 UTF16-encoded surrogate characters. This is an efficient heuristic,
2995 assuming that non-surrogate characters with a code point >= 0x8000 are
2996 rare in most input.
2997 FAST_CHAR_MASK is used when the input is in native byte ordering,
2998 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002999*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003000#if (SIZEOF_LONG == 8)
3001# define FAST_CHAR_MASK 0x8000800080008000L
3002# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3003#elif (SIZEOF_LONG == 4)
3004# define FAST_CHAR_MASK 0x80008000L
3005# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3006#else
3007# error C 'long' size should be either 4 or 8!
3008#endif
3009
Walter Dörwald69652032004-09-07 20:24:22 +00003010PyObject *
3011PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003012 Py_ssize_t size,
3013 const char *errors,
3014 int *byteorder,
3015 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003016{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003017 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003018 Py_ssize_t startinpos;
3019 Py_ssize_t endinpos;
3020 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 PyUnicodeObject *unicode;
3022 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003023 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003024 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003025 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003026 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003027 /* Offsets from q for retrieving byte pairs in the right order. */
3028#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3029 int ihi = 1, ilo = 0;
3030#else
3031 int ihi = 0, ilo = 1;
3032#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 PyObject *errorHandler = NULL;
3034 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035
3036 /* Note: size will always be longer than the resulting Unicode
3037 character count */
3038 unicode = _PyUnicode_New(size);
3039 if (!unicode)
3040 return NULL;
3041 if (size == 0)
3042 return (PyObject *)unicode;
3043
3044 /* Unpack UTF-16 encoded data */
3045 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003046 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003047 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048
3049 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003050 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003052 /* Check for BOM marks (U+FEFF) in the input and adjust current
3053 byte order setting accordingly. In native mode, the leading BOM
3054 mark is skipped, in all other modes, it is copied to the output
3055 stream as-is (giving a ZWNBSP character). */
3056 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003057 if (size >= 2) {
3058 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003059#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 if (bom == 0xFEFF) {
3061 q += 2;
3062 bo = -1;
3063 }
3064 else if (bom == 0xFFFE) {
3065 q += 2;
3066 bo = 1;
3067 }
Tim Petersced69f82003-09-16 20:30:58 +00003068#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 if (bom == 0xFEFF) {
3070 q += 2;
3071 bo = 1;
3072 }
3073 else if (bom == 0xFFFE) {
3074 q += 2;
3075 bo = -1;
3076 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003077#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080
Tim Peters772747b2001-08-09 22:21:55 +00003081 if (bo == -1) {
3082 /* force LE */
3083 ihi = 1;
3084 ilo = 0;
3085 }
3086 else if (bo == 1) {
3087 /* force BE */
3088 ihi = 0;
3089 ilo = 1;
3090 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003091#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3092 native_ordering = ilo < ihi;
3093#else
3094 native_ordering = ilo > ihi;
3095#endif
Tim Peters772747b2001-08-09 22:21:55 +00003096
Antoine Pitrouab868312009-01-10 15:40:25 +00003097 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003098 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003099 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003100 /* First check for possible aligned read of a C 'long'. Unaligned
3101 reads are more expensive, better to defer to another iteration. */
3102 if (!((size_t) q & LONG_PTR_MASK)) {
3103 /* Fast path for runs of non-surrogate chars. */
3104 register const unsigned char *_q = q;
3105 Py_UNICODE *_p = p;
3106 if (native_ordering) {
3107 /* Native ordering is simple: as long as the input cannot
3108 possibly contain a surrogate char, do an unrolled copy
3109 of several 16-bit code points to the target object.
3110 The non-surrogate check is done on several input bytes
3111 at a time (as many as a C 'long' can contain). */
3112 while (_q < aligned_end) {
3113 unsigned long data = * (unsigned long *) _q;
3114 if (data & FAST_CHAR_MASK)
3115 break;
3116 _p[0] = ((unsigned short *) _q)[0];
3117 _p[1] = ((unsigned short *) _q)[1];
3118#if (SIZEOF_LONG == 8)
3119 _p[2] = ((unsigned short *) _q)[2];
3120 _p[3] = ((unsigned short *) _q)[3];
3121#endif
3122 _q += SIZEOF_LONG;
3123 _p += SIZEOF_LONG / 2;
3124 }
3125 }
3126 else {
3127 /* Byteswapped ordering is similar, but we must decompose
3128 the copy bytewise, and take care of zero'ing out the
3129 upper bytes if the target object is in 32-bit units
3130 (that is, in UCS-4 builds). */
3131 while (_q < aligned_end) {
3132 unsigned long data = * (unsigned long *) _q;
3133 if (data & SWAPPED_FAST_CHAR_MASK)
3134 break;
3135 /* Zero upper bytes in UCS-4 builds */
3136#if (Py_UNICODE_SIZE > 2)
3137 _p[0] = 0;
3138 _p[1] = 0;
3139#if (SIZEOF_LONG == 8)
3140 _p[2] = 0;
3141 _p[3] = 0;
3142#endif
3143#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003144 /* Issue #4916; UCS-4 builds on big endian machines must
3145 fill the two last bytes of each 4-byte unit. */
3146#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3147# define OFF 2
3148#else
3149# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003150#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003151 ((unsigned char *) _p)[OFF + 1] = _q[0];
3152 ((unsigned char *) _p)[OFF + 0] = _q[1];
3153 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3154 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3155#if (SIZEOF_LONG == 8)
3156 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3157 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3158 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3159 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3160#endif
3161#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003162 _q += SIZEOF_LONG;
3163 _p += SIZEOF_LONG / 2;
3164 }
3165 }
3166 p = _p;
3167 q = _q;
3168 if (q >= e)
3169 break;
3170 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172
Benjamin Peterson14339b62009-01-31 16:36:08 +00003173 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003174
3175 if (ch < 0xD800 || ch > 0xDFFF) {
3176 *p++ = ch;
3177 continue;
3178 }
3179
3180 /* UTF-16 code pair: */
3181 if (q > e) {
3182 errmsg = "unexpected end of data";
3183 startinpos = (((const char *)q) - 2) - starts;
3184 endinpos = ((const char *)e) + 1 - starts;
3185 goto utf16Error;
3186 }
3187 if (0xD800 <= ch && ch <= 0xDBFF) {
3188 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3189 q += 2;
3190 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003191#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003192 *p++ = ch;
3193 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003194#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003195 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003196#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 continue;
3198 }
3199 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003200 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003201 startinpos = (((const char *)q)-4)-starts;
3202 endinpos = startinpos+2;
3203 goto utf16Error;
3204 }
3205
Benjamin Peterson14339b62009-01-31 16:36:08 +00003206 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003207 errmsg = "illegal encoding";
3208 startinpos = (((const char *)q)-2)-starts;
3209 endinpos = startinpos+2;
3210 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003211
Benjamin Peterson29060642009-01-31 22:14:21 +00003212 utf16Error:
3213 outpos = p - PyUnicode_AS_UNICODE(unicode);
3214 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003215 errors,
3216 &errorHandler,
3217 "utf16", errmsg,
3218 &starts,
3219 (const char **)&e,
3220 &startinpos,
3221 &endinpos,
3222 &exc,
3223 (const char **)&q,
3224 &unicode,
3225 &outpos,
3226 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003229 /* remaining byte at the end? (size should be even) */
3230 if (e == q) {
3231 if (!consumed) {
3232 errmsg = "truncated data";
3233 startinpos = ((const char *)q) - starts;
3234 endinpos = ((const char *)e) + 1 - starts;
3235 outpos = p - PyUnicode_AS_UNICODE(unicode);
3236 if (unicode_decode_call_errorhandler(
3237 errors,
3238 &errorHandler,
3239 "utf16", errmsg,
3240 &starts,
3241 (const char **)&e,
3242 &startinpos,
3243 &endinpos,
3244 &exc,
3245 (const char **)&q,
3246 &unicode,
3247 &outpos,
3248 &p))
3249 goto onError;
3250 /* The remaining input chars are ignored if the callback
3251 chooses to skip the input */
3252 }
3253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254
3255 if (byteorder)
3256 *byteorder = bo;
3257
Walter Dörwald69652032004-09-07 20:24:22 +00003258 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003259 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003260
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003262 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 goto onError;
3264
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003265 Py_XDECREF(errorHandler);
3266 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 return (PyObject *)unicode;
3268
Benjamin Peterson29060642009-01-31 22:14:21 +00003269 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 Py_XDECREF(errorHandler);
3272 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 return NULL;
3274}
3275
Antoine Pitrouab868312009-01-10 15:40:25 +00003276#undef FAST_CHAR_MASK
3277#undef SWAPPED_FAST_CHAR_MASK
3278
Tim Peters772747b2001-08-09 22:21:55 +00003279PyObject *
3280PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 Py_ssize_t size,
3282 const char *errors,
3283 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003285 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003286 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003287 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003288#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003289 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003290#else
3291 const int pairs = 0;
3292#endif
Tim Peters772747b2001-08-09 22:21:55 +00003293 /* Offsets from p for storing byte pairs in the right order. */
3294#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3295 int ihi = 1, ilo = 0;
3296#else
3297 int ihi = 0, ilo = 1;
3298#endif
3299
Benjamin Peterson29060642009-01-31 22:14:21 +00003300#define STORECHAR(CH) \
3301 do { \
3302 p[ihi] = ((CH) >> 8) & 0xff; \
3303 p[ilo] = (CH) & 0xff; \
3304 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003305 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003307#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003308 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003309 if (s[i] >= 0x10000)
3310 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003311#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003312 /* 2 * (size + pairs + (byteorder == 0)) */
3313 if (size > PY_SSIZE_T_MAX ||
3314 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003315 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003316 nsize = size + pairs + (byteorder == 0);
3317 bytesize = nsize * 2;
3318 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003319 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003320 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 if (v == NULL)
3322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003324 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003327 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003328 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003329
3330 if (byteorder == -1) {
3331 /* force LE */
3332 ihi = 1;
3333 ilo = 0;
3334 }
3335 else if (byteorder == 1) {
3336 /* force BE */
3337 ihi = 0;
3338 ilo = 1;
3339 }
3340
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003341 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 Py_UNICODE ch = *s++;
3343 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003344#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 if (ch >= 0x10000) {
3346 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3347 ch = 0xD800 | ((ch-0x10000) >> 10);
3348 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003349#endif
Tim Peters772747b2001-08-09 22:21:55 +00003350 STORECHAR(ch);
3351 if (ch2)
3352 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003353 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003354
3355 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003356 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003357#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358}
3359
3360PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3361{
3362 if (!PyUnicode_Check(unicode)) {
3363 PyErr_BadArgument();
3364 return NULL;
3365 }
3366 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003367 PyUnicode_GET_SIZE(unicode),
3368 NULL,
3369 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370}
3371
3372/* --- Unicode Escape Codec ----------------------------------------------- */
3373
Fredrik Lundh06d12682001-01-24 07:59:11 +00003374static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003375
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 Py_ssize_t size,
3378 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003380 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003381 Py_ssize_t startinpos;
3382 Py_ssize_t endinpos;
3383 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003387 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003388 char* message;
3389 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 PyObject *errorHandler = NULL;
3391 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003392
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 /* Escaped strings will always be longer than the resulting
3394 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003395 length after conversion to the true value.
3396 (but if the error callback returns a long replacement string
3397 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003398 v = _PyUnicode_New(size);
3399 if (v == NULL)
3400 goto onError;
3401 if (size == 0)
3402 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003403
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003404 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003406
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 while (s < end) {
3408 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003409 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411
3412 /* Non-escape characters are interpreted as Unicode ordinals */
3413 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003414 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415 continue;
3416 }
3417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 /* \ - Escapes */
3420 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003421 c = *s++;
3422 if (s > end)
3423 c = '\0'; /* Invalid after \ */
3424 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425
Benjamin Peterson29060642009-01-31 22:14:21 +00003426 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427 case '\n': break;
3428 case '\\': *p++ = '\\'; break;
3429 case '\'': *p++ = '\''; break;
3430 case '\"': *p++ = '\"'; break;
3431 case 'b': *p++ = '\b'; break;
3432 case 'f': *p++ = '\014'; break; /* FF */
3433 case 't': *p++ = '\t'; break;
3434 case 'n': *p++ = '\n'; break;
3435 case 'r': *p++ = '\r'; break;
3436 case 'v': *p++ = '\013'; break; /* VT */
3437 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3438
Benjamin Peterson29060642009-01-31 22:14:21 +00003439 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 case '0': case '1': case '2': case '3':
3441 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003442 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003443 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003444 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003445 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003446 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003448 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 break;
3450
Benjamin Peterson29060642009-01-31 22:14:21 +00003451 /* hex escapes */
3452 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003454 digits = 2;
3455 message = "truncated \\xXX escape";
3456 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457
Benjamin Peterson29060642009-01-31 22:14:21 +00003458 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003460 digits = 4;
3461 message = "truncated \\uXXXX escape";
3462 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003465 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003466 digits = 8;
3467 message = "truncated \\UXXXXXXXX escape";
3468 hexescape:
3469 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 outpos = p-PyUnicode_AS_UNICODE(v);
3471 if (s+digits>end) {
3472 endinpos = size;
3473 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003474 errors, &errorHandler,
3475 "unicodeescape", "end of string in escape sequence",
3476 &starts, &end, &startinpos, &endinpos, &exc, &s,
3477 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 goto onError;
3479 goto nextByte;
3480 }
3481 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003482 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003483 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 endinpos = (s+i+1)-starts;
3485 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003486 errors, &errorHandler,
3487 "unicodeescape", message,
3488 &starts, &end, &startinpos, &endinpos, &exc, &s,
3489 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003490 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003492 }
3493 chr = (chr<<4) & ~0xF;
3494 if (c >= '0' && c <= '9')
3495 chr += c - '0';
3496 else if (c >= 'a' && c <= 'f')
3497 chr += 10 + c - 'a';
3498 else
3499 chr += 10 + c - 'A';
3500 }
3501 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003502 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 /* _decoding_error will have already written into the
3504 target buffer. */
3505 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003506 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003507 /* when we get here, chr is a 32-bit unicode character */
3508 if (chr <= 0xffff)
3509 /* UCS-2 character */
3510 *p++ = (Py_UNICODE) chr;
3511 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003512 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003513 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003514#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003515 *p++ = chr;
3516#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003517 chr -= 0x10000L;
3518 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003519 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003520#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003521 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 endinpos = s-starts;
3523 outpos = p-PyUnicode_AS_UNICODE(v);
3524 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003525 errors, &errorHandler,
3526 "unicodeescape", "illegal Unicode character",
3527 &starts, &end, &startinpos, &endinpos, &exc, &s,
3528 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003529 goto onError;
3530 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003531 break;
3532
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003534 case 'N':
3535 message = "malformed \\N character escape";
3536 if (ucnhash_CAPI == NULL) {
3537 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003538 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003539 if (ucnhash_CAPI == NULL)
3540 goto ucnhashError;
3541 }
3542 if (*s == '{') {
3543 const char *start = s+1;
3544 /* look for the closing brace */
3545 while (*s != '}' && s < end)
3546 s++;
3547 if (s > start && s < end && *s == '}') {
3548 /* found a name. look it up in the unicode database */
3549 message = "unknown Unicode character name";
3550 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003551 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003552 goto store;
3553 }
3554 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 endinpos = s-starts;
3556 outpos = p-PyUnicode_AS_UNICODE(v);
3557 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003558 errors, &errorHandler,
3559 "unicodeescape", message,
3560 &starts, &end, &startinpos, &endinpos, &exc, &s,
3561 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003562 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003563 break;
3564
3565 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003566 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 message = "\\ at end of string";
3568 s--;
3569 endinpos = s-starts;
3570 outpos = p-PyUnicode_AS_UNICODE(v);
3571 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003572 errors, &errorHandler,
3573 "unicodeescape", message,
3574 &starts, &end, &startinpos, &endinpos, &exc, &s,
3575 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003576 goto onError;
3577 }
3578 else {
3579 *p++ = '\\';
3580 *p++ = (unsigned char)s[-1];
3581 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003582 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003584 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003587 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003589 Py_XDECREF(errorHandler);
3590 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003592
Benjamin Peterson29060642009-01-31 22:14:21 +00003593 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003594 PyErr_SetString(
3595 PyExc_UnicodeError,
3596 "\\N escapes not supported (can't load unicodedata module)"
3597 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003598 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 Py_XDECREF(errorHandler);
3600 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003601 return NULL;
3602
Benjamin Peterson29060642009-01-31 22:14:21 +00003603 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 Py_XDECREF(errorHandler);
3606 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 return NULL;
3608}
3609
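/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.  (NOTE(review): PyUnicode_EncodeUnicodeEscape() below takes
   only (s, size) and has no `quotes` argument, so this sentence looks
   stale -- confirm and drop.)

*/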
3616
Thomas Wouters477c8d52006-05-27 19:21:47 +00003617Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003618 Py_ssize_t size,
3619 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003620{
3621 /* like wcschr, but doesn't stop at NULL characters */
3622
3623 while (size-- > 0) {
3624 if (*s == ch)
3625 return s;
3626 s++;
3627 }
3628
3629 return NULL;
3630}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003631
/* Lower-case hexadecimal digits, indexed by nibble value (0..15). */
static const char *hexdigits = "0123456789abcdef";
3633
3634PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003637 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003640#ifdef Py_UNICODE_WIDE
3641 const Py_ssize_t expandsize = 10;
3642#else
3643 const Py_ssize_t expandsize = 6;
3644#endif
3645
Thomas Wouters89f507f2006-12-13 04:49:30 +00003646 /* XXX(nnorwitz): rather than over-allocating, it would be
3647 better to choose a different scheme. Perhaps scan the
3648 first N-chars of the string and allocate based on that size.
3649 */
3650 /* Initial allocation is based on the longest-possible unichr
3651 escape.
3652
3653 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3654 unichr, so in this case it's the longest unichr escape. In
3655 narrow (UTF-16) builds this is five chars per source unichr
3656 since there are two unichrs in the surrogate pair, so in narrow
3657 (UTF-16) builds it's not the longest unichr escape.
3658
3659 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3660 so in the narrow (UTF-16) build case it's the longest unichr
3661 escape.
3662 */
3663
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003664 if (size == 0)
3665 return PyBytes_FromStringAndSize(NULL, 0);
3666
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003667 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003669
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003670 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 2
3672 + expandsize*size
3673 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 if (repr == NULL)
3675 return NULL;
3676
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003677 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 while (size-- > 0) {
3680 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003681
Walter Dörwald79e913e2007-05-12 11:08:06 +00003682 /* Escape backslashes */
3683 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 *p++ = '\\';
3685 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003686 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003687 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003688
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003689#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003690 /* Map 21-bit characters to '\U00xxxxxx' */
3691 else if (ch >= 0x10000) {
3692 *p++ = '\\';
3693 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003694 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3695 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3696 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3697 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3698 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3699 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3700 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3701 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003703 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003704#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3706 else if (ch >= 0xD800 && ch < 0xDC00) {
3707 Py_UNICODE ch2;
3708 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003709
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 ch2 = *s++;
3711 size--;
3712 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3713 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3714 *p++ = '\\';
3715 *p++ = 'U';
3716 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3717 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3718 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3719 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3720 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3721 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3722 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3723 *p++ = hexdigits[ucs & 0x0000000F];
3724 continue;
3725 }
3726 /* Fall through: isolated surrogates are copied as-is */
3727 s--;
3728 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003729 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003730#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003731
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003733 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 *p++ = '\\';
3735 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003736 *p++ = hexdigits[(ch >> 12) & 0x000F];
3737 *p++ = hexdigits[(ch >> 8) & 0x000F];
3738 *p++ = hexdigits[(ch >> 4) & 0x000F];
3739 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003741
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003742 /* Map special whitespace to '\t', \n', '\r' */
3743 else if (ch == '\t') {
3744 *p++ = '\\';
3745 *p++ = 't';
3746 }
3747 else if (ch == '\n') {
3748 *p++ = '\\';
3749 *p++ = 'n';
3750 }
3751 else if (ch == '\r') {
3752 *p++ = '\\';
3753 *p++ = 'r';
3754 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003755
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003756 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003757 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003759 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003760 *p++ = hexdigits[(ch >> 4) & 0x000F];
3761 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003762 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003763
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 /* Copy everything else as-is */
3765 else
3766 *p++ = (char) ch;
3767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003769 assert(p - PyBytes_AS_STRING(repr) > 0);
3770 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3771 return NULL;
3772 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773}
3774
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003775PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003777 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 if (!PyUnicode_Check(unicode)) {
3779 PyErr_BadArgument();
3780 return NULL;
3781 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003782 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3783 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003784 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785}
3786
3787/* --- Raw Unicode Escape Codec ------------------------------------------- */
3788
3789PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003790 Py_ssize_t size,
3791 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003794 Py_ssize_t startinpos;
3795 Py_ssize_t endinpos;
3796 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799 const char *end;
3800 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003801 PyObject *errorHandler = NULL;
3802 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003803
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 /* Escaped strings will always be longer than the resulting
3805 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 length after conversion to the true value. (But decoding error
3807 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 v = _PyUnicode_New(size);
3809 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003810 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003812 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 end = s + size;
3815 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003816 unsigned char c;
3817 Py_UCS4 x;
3818 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003819 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820
Benjamin Peterson29060642009-01-31 22:14:21 +00003821 /* Non-escape characters are interpreted as Unicode ordinals */
3822 if (*s != '\\') {
3823 *p++ = (unsigned char)*s++;
3824 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003825 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003826 startinpos = s-starts;
3827
3828 /* \u-escapes are only interpreted iff the number of leading
3829 backslashes if odd */
3830 bs = s;
3831 for (;s < end;) {
3832 if (*s != '\\')
3833 break;
3834 *p++ = (unsigned char)*s++;
3835 }
3836 if (((s - bs) & 1) == 0 ||
3837 s >= end ||
3838 (*s != 'u' && *s != 'U')) {
3839 continue;
3840 }
3841 p--;
3842 count = *s=='u' ? 4 : 8;
3843 s++;
3844
3845 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3846 outpos = p-PyUnicode_AS_UNICODE(v);
3847 for (x = 0, i = 0; i < count; ++i, ++s) {
3848 c = (unsigned char)*s;
3849 if (!ISXDIGIT(c)) {
3850 endinpos = s-starts;
3851 if (unicode_decode_call_errorhandler(
3852 errors, &errorHandler,
3853 "rawunicodeescape", "truncated \\uXXXX",
3854 &starts, &end, &startinpos, &endinpos, &exc, &s,
3855 &v, &outpos, &p))
3856 goto onError;
3857 goto nextByte;
3858 }
3859 x = (x<<4) & ~0xF;
3860 if (c >= '0' && c <= '9')
3861 x += c - '0';
3862 else if (c >= 'a' && c <= 'f')
3863 x += 10 + c - 'a';
3864 else
3865 x += 10 + c - 'A';
3866 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003867 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003868 /* UCS-2 character */
3869 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003870 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003871 /* UCS-4 character. Either store directly, or as
3872 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003873#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003874 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003875#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003876 x -= 0x10000L;
3877 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3878 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003879#endif
3880 } else {
3881 endinpos = s-starts;
3882 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003883 if (unicode_decode_call_errorhandler(
3884 errors, &errorHandler,
3885 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 &starts, &end, &startinpos, &endinpos, &exc, &s,
3887 &v, &outpos, &p))
3888 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003889 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003890 nextByte:
3891 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003893 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003895 Py_XDECREF(errorHandler);
3896 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003898
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 Py_XDECREF(errorHandler);
3902 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 return NULL;
3904}
3905
3906PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003907 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003909 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910 char *p;
3911 char *q;
3912
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003913#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003914 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003915#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003916 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003917#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003918
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003919 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003920 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003921
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003922 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 if (repr == NULL)
3924 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003925 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003926 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003928 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929 while (size-- > 0) {
3930 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003931#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003932 /* Map 32-bit characters to '\Uxxxxxxxx' */
3933 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003934 *p++ = '\\';
3935 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003936 *p++ = hexdigits[(ch >> 28) & 0xf];
3937 *p++ = hexdigits[(ch >> 24) & 0xf];
3938 *p++ = hexdigits[(ch >> 20) & 0xf];
3939 *p++ = hexdigits[(ch >> 16) & 0xf];
3940 *p++ = hexdigits[(ch >> 12) & 0xf];
3941 *p++ = hexdigits[(ch >> 8) & 0xf];
3942 *p++ = hexdigits[(ch >> 4) & 0xf];
3943 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003944 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003945 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003946#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003947 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3948 if (ch >= 0xD800 && ch < 0xDC00) {
3949 Py_UNICODE ch2;
3950 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003951
Benjamin Peterson29060642009-01-31 22:14:21 +00003952 ch2 = *s++;
3953 size--;
3954 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3955 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3956 *p++ = '\\';
3957 *p++ = 'U';
3958 *p++ = hexdigits[(ucs >> 28) & 0xf];
3959 *p++ = hexdigits[(ucs >> 24) & 0xf];
3960 *p++ = hexdigits[(ucs >> 20) & 0xf];
3961 *p++ = hexdigits[(ucs >> 16) & 0xf];
3962 *p++ = hexdigits[(ucs >> 12) & 0xf];
3963 *p++ = hexdigits[(ucs >> 8) & 0xf];
3964 *p++ = hexdigits[(ucs >> 4) & 0xf];
3965 *p++ = hexdigits[ucs & 0xf];
3966 continue;
3967 }
3968 /* Fall through: isolated surrogates are copied as-is */
3969 s--;
3970 size++;
3971 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003972#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003973 /* Map 16-bit characters to '\uxxxx' */
3974 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 *p++ = '\\';
3976 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003977 *p++ = hexdigits[(ch >> 12) & 0xf];
3978 *p++ = hexdigits[(ch >> 8) & 0xf];
3979 *p++ = hexdigits[(ch >> 4) & 0xf];
3980 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 /* Copy everything else as-is */
3983 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 *p++ = (char) ch;
3985 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003986 size = p - q;
3987
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003988 assert(size > 0);
3989 if (_PyBytes_Resize(&repr, size) < 0)
3990 return NULL;
3991 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992}
3993
3994PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3995{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003996 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003998 PyErr_BadArgument();
3999 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004001 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4002 PyUnicode_GET_SIZE(unicode));
4003
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004004 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005}
4006
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004007/* --- Unicode Internal Codec ------------------------------------------- */
4008
4009PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004010 Py_ssize_t size,
4011 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004012{
4013 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004014 Py_ssize_t startinpos;
4015 Py_ssize_t endinpos;
4016 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004017 PyUnicodeObject *v;
4018 Py_UNICODE *p;
4019 const char *end;
4020 const char *reason;
4021 PyObject *errorHandler = NULL;
4022 PyObject *exc = NULL;
4023
Neal Norwitzd43069c2006-01-08 01:12:10 +00004024#ifdef Py_UNICODE_WIDE
4025 Py_UNICODE unimax = PyUnicode_GetMax();
4026#endif
4027
Thomas Wouters89f507f2006-12-13 04:49:30 +00004028 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004029 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4030 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004032 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004033 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004034 p = PyUnicode_AS_UNICODE(v);
4035 end = s + size;
4036
4037 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004038 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004039 /* We have to sanity check the raw data, otherwise doom looms for
4040 some malformed UCS-4 data. */
4041 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004042#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004043 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004044#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004045 end-s < Py_UNICODE_SIZE
4046 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004048 startinpos = s - starts;
4049 if (end-s < Py_UNICODE_SIZE) {
4050 endinpos = end-starts;
4051 reason = "truncated input";
4052 }
4053 else {
4054 endinpos = s - starts + Py_UNICODE_SIZE;
4055 reason = "illegal code point (> 0x10FFFF)";
4056 }
4057 outpos = p - PyUnicode_AS_UNICODE(v);
4058 if (unicode_decode_call_errorhandler(
4059 errors, &errorHandler,
4060 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004061 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004062 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004063 goto onError;
4064 }
4065 }
4066 else {
4067 p++;
4068 s += Py_UNICODE_SIZE;
4069 }
4070 }
4071
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004072 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004073 goto onError;
4074 Py_XDECREF(errorHandler);
4075 Py_XDECREF(exc);
4076 return (PyObject *)v;
4077
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004079 Py_XDECREF(v);
4080 Py_XDECREF(errorHandler);
4081 Py_XDECREF(exc);
4082 return NULL;
4083}
4084
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085/* --- Latin-1 Codec ------------------------------------------------------ */
4086
4087PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 Py_ssize_t size,
4089 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090{
4091 PyUnicodeObject *v;
4092 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004093 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004094
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004096 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 Py_UNICODE r = *(unsigned char*)s;
4098 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004099 }
4100
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 v = _PyUnicode_New(size);
4102 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004105 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004107 e = s + size;
4108 /* Unrolling the copy makes it much faster by reducing the looping
4109 overhead. This is similar to what many memcpy() implementations do. */
4110 unrolled_end = e - 4;
4111 while (s < unrolled_end) {
4112 p[0] = (unsigned char) s[0];
4113 p[1] = (unsigned char) s[1];
4114 p[2] = (unsigned char) s[2];
4115 p[3] = (unsigned char) s[3];
4116 s += 4;
4117 p += 4;
4118 }
4119 while (s < e)
4120 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004122
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 Py_XDECREF(v);
4125 return NULL;
4126}
4127
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128/* create or adjust a UnicodeEncodeError */
4129static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 const char *encoding,
4131 const Py_UNICODE *unicode, Py_ssize_t size,
4132 Py_ssize_t startpos, Py_ssize_t endpos,
4133 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 *exceptionObject = PyUnicodeEncodeError_Create(
4137 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 }
4139 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4141 goto onError;
4142 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4143 goto onError;
4144 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4145 goto onError;
4146 return;
4147 onError:
4148 Py_DECREF(*exceptionObject);
4149 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 }
4151}
4152
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153/* raises a UnicodeEncodeError */
4154static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 const char *encoding,
4156 const Py_UNICODE *unicode, Py_ssize_t size,
4157 Py_ssize_t startpos, Py_ssize_t endpos,
4158 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159{
4160 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164}
4165
4166/* error handling callback helper:
4167 build arguments, call the callback and check the arguments,
4168 put the result into newpos and return the replacement string, which
4169 has to be freed by the caller */
4170static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004171 PyObject **errorHandler,
4172 const char *encoding, const char *reason,
4173 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4174 Py_ssize_t startpos, Py_ssize_t endpos,
4175 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004177 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178
4179 PyObject *restuple;
4180 PyObject *resunicode;
4181
4182 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186 }
4187
4188 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192
4193 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004198 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 Py_DECREF(restuple);
4200 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004202 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 &resunicode, newpos)) {
4204 Py_DECREF(restuple);
4205 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004207 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4208 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4209 Py_DECREF(restuple);
4210 return NULL;
4211 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004214 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004215 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4216 Py_DECREF(restuple);
4217 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004218 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 Py_INCREF(resunicode);
4220 Py_DECREF(restuple);
4221 return resunicode;
4222}
4223
4224static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 Py_ssize_t size,
4226 const char *errors,
4227 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228{
4229 /* output object */
4230 PyObject *res;
4231 /* pointers to the beginning and end+1 of input */
4232 const Py_UNICODE *startp = p;
4233 const Py_UNICODE *endp = p + size;
4234 /* pointer to the beginning of the unencodable characters */
4235 /* const Py_UNICODE *badp = NULL; */
4236 /* pointer into the output */
4237 char *str;
4238 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004239 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004240 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4241 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 PyObject *errorHandler = NULL;
4243 PyObject *exc = NULL;
4244 /* the following variable is used for caching string comparisons
4245 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4246 int known_errorHandler = -1;
4247
4248 /* allocate enough for a simple encoding without
4249 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004250 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004251 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004252 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004254 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004255 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 ressize = size;
4257
4258 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 /* can we encode this? */
4262 if (c<limit) {
4263 /* no overflow check, because we know that the space is enough */
4264 *str++ = (char)c;
4265 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004266 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004267 else {
4268 Py_ssize_t unicodepos = p-startp;
4269 Py_ssize_t requiredsize;
4270 PyObject *repunicode;
4271 Py_ssize_t repsize;
4272 Py_ssize_t newpos;
4273 Py_ssize_t respos;
4274 Py_UNICODE *uni2;
4275 /* startpos for collecting unencodable chars */
4276 const Py_UNICODE *collstart = p;
4277 const Py_UNICODE *collend = p;
4278 /* find all unecodable characters */
4279 while ((collend < endp) && ((*collend)>=limit))
4280 ++collend;
4281 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4282 if (known_errorHandler==-1) {
4283 if ((errors==NULL) || (!strcmp(errors, "strict")))
4284 known_errorHandler = 1;
4285 else if (!strcmp(errors, "replace"))
4286 known_errorHandler = 2;
4287 else if (!strcmp(errors, "ignore"))
4288 known_errorHandler = 3;
4289 else if (!strcmp(errors, "xmlcharrefreplace"))
4290 known_errorHandler = 4;
4291 else
4292 known_errorHandler = 0;
4293 }
4294 switch (known_errorHandler) {
4295 case 1: /* strict */
4296 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4297 goto onError;
4298 case 2: /* replace */
4299 while (collstart++<collend)
4300 *str++ = '?'; /* fall through */
4301 case 3: /* ignore */
4302 p = collend;
4303 break;
4304 case 4: /* xmlcharrefreplace */
4305 respos = str - PyBytes_AS_STRING(res);
4306 /* determine replacement size (temporarily (mis)uses p) */
4307 for (p = collstart, repsize = 0; p < collend; ++p) {
4308 if (*p<10)
4309 repsize += 2+1+1;
4310 else if (*p<100)
4311 repsize += 2+2+1;
4312 else if (*p<1000)
4313 repsize += 2+3+1;
4314 else if (*p<10000)
4315 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004316#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004317 else
4318 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004319#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 else if (*p<100000)
4321 repsize += 2+5+1;
4322 else if (*p<1000000)
4323 repsize += 2+6+1;
4324 else
4325 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004326#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004327 }
4328 requiredsize = respos+repsize+(endp-collend);
4329 if (requiredsize > ressize) {
4330 if (requiredsize<2*ressize)
4331 requiredsize = 2*ressize;
4332 if (_PyBytes_Resize(&res, requiredsize))
4333 goto onError;
4334 str = PyBytes_AS_STRING(res) + respos;
4335 ressize = requiredsize;
4336 }
4337 /* generate replacement (temporarily (mis)uses p) */
4338 for (p = collstart; p < collend; ++p) {
4339 str += sprintf(str, "&#%d;", (int)*p);
4340 }
4341 p = collend;
4342 break;
4343 default:
4344 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4345 encoding, reason, startp, size, &exc,
4346 collstart-startp, collend-startp, &newpos);
4347 if (repunicode == NULL)
4348 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004349 if (PyBytes_Check(repunicode)) {
4350 /* Directly copy bytes result to output. */
4351 repsize = PyBytes_Size(repunicode);
4352 if (repsize > 1) {
4353 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004354 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004355 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4356 Py_DECREF(repunicode);
4357 goto onError;
4358 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004359 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004360 ressize += repsize-1;
4361 }
4362 memcpy(str, PyBytes_AsString(repunicode), repsize);
4363 str += repsize;
4364 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004365 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004366 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004367 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004368 /* need more space? (at least enough for what we
4369 have+the replacement+the rest of the string, so
4370 we won't have to check space for encodable characters) */
4371 respos = str - PyBytes_AS_STRING(res);
4372 repsize = PyUnicode_GET_SIZE(repunicode);
4373 requiredsize = respos+repsize+(endp-collend);
4374 if (requiredsize > ressize) {
4375 if (requiredsize<2*ressize)
4376 requiredsize = 2*ressize;
4377 if (_PyBytes_Resize(&res, requiredsize)) {
4378 Py_DECREF(repunicode);
4379 goto onError;
4380 }
4381 str = PyBytes_AS_STRING(res) + respos;
4382 ressize = requiredsize;
4383 }
4384 /* check if there is anything unencodable in the replacement
4385 and copy it to the output */
4386 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4387 c = *uni2;
4388 if (c >= limit) {
4389 raise_encode_exception(&exc, encoding, startp, size,
4390 unicodepos, unicodepos+1, reason);
4391 Py_DECREF(repunicode);
4392 goto onError;
4393 }
4394 *str = (char)c;
4395 }
4396 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004397 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004398 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004399 }
4400 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004401 /* Resize if we allocated to much */
4402 size = str - PyBytes_AS_STRING(res);
4403 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004404 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004405 if (_PyBytes_Resize(&res, size) < 0)
4406 goto onError;
4407 }
4408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 Py_XDECREF(errorHandler);
4410 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004411 return res;
4412
4413 onError:
4414 Py_XDECREF(res);
4415 Py_XDECREF(errorHandler);
4416 Py_XDECREF(exc);
4417 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418}
4419
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004421 Py_ssize_t size,
4422 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425}
4426
4427PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4428{
4429 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 PyErr_BadArgument();
4431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 }
4433 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 PyUnicode_GET_SIZE(unicode),
4435 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436}
4437
4438/* --- 7-bit ASCII Codec -------------------------------------------------- */
4439
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004441 Py_ssize_t size,
4442 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 PyUnicodeObject *v;
4446 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004447 Py_ssize_t startinpos;
4448 Py_ssize_t endinpos;
4449 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 const char *e;
4451 PyObject *errorHandler = NULL;
4452 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004453
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004455 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 Py_UNICODE r = *(unsigned char*)s;
4457 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004458 }
Tim Petersced69f82003-09-16 20:30:58 +00004459
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 v = _PyUnicode_New(size);
4461 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004466 e = s + size;
4467 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004468 register unsigned char c = (unsigned char)*s;
4469 if (c < 128) {
4470 *p++ = c;
4471 ++s;
4472 }
4473 else {
4474 startinpos = s-starts;
4475 endinpos = startinpos + 1;
4476 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4477 if (unicode_decode_call_errorhandler(
4478 errors, &errorHandler,
4479 "ascii", "ordinal not in range(128)",
4480 &starts, &e, &startinpos, &endinpos, &exc, &s,
4481 &v, &outpos, &p))
4482 goto onError;
4483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004485 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4487 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 Py_XDECREF(errorHandler);
4489 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004491
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 Py_XDECREF(errorHandler);
4495 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496 return NULL;
4497}
4498
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 Py_ssize_t size,
4501 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504}
4505
4506PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4507{
4508 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 PyErr_BadArgument();
4510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511 }
4512 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 PyUnicode_GET_SIZE(unicode),
4514 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515}
4516
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004517#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004518
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004519/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004520
/* The MBCS helpers below take their lengths as int.  When size_t is wider
   than int, the public entry points have to feed them the data in pieces
   of at most INT_MAX units; NEED_RETRY switches that chunking on. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif
4524
/* XXX This code only handles "true" double-byte encodings, because
   a) an incomplete character is assumed to be exactly one byte long, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings, see the IsDBCSLeadByteEx documentation. */

/* Return 1 if the byte at s[offset] is a DBCS lead byte, 0 otherwise.

   IsDBCSLeadByte() alone is not taken as proof: the byte is accepted only
   if CharPrev() cannot step back from it, or the position CharPrev()
   returns does not hold a lead byte, or that position is two bytes
   earlier. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
4540
4541/*
4542 * Decode MBCS string into unicode object. If 'final' is set, converts
4543 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4544 */
4545static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 const char *s, /* MBCS string */
4547 int size, /* sizeof MBCS string */
4548 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004549{
4550 Py_UNICODE *p;
4551 Py_ssize_t n = 0;
4552 int usize = 0;
4553
4554 assert(size >= 0);
4555
4556 /* Skip trailing lead-byte unless 'final' is set */
4557 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004559
4560 /* First get the size of the result */
4561 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4563 if (usize == 0) {
4564 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4565 return -1;
4566 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004567 }
4568
4569 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 /* Create unicode object */
4571 *v = _PyUnicode_New(usize);
4572 if (*v == NULL)
4573 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004574 }
4575 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 /* Extend unicode object */
4577 n = PyUnicode_GET_SIZE(*v);
4578 if (_PyUnicode_Resize(v, n + usize) < 0)
4579 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004580 }
4581
4582 /* Do the conversion */
4583 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 p = PyUnicode_AS_UNICODE(*v) + n;
4585 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4586 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4587 return -1;
4588 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004589 }
4590
4591 return size;
4592}
4593
4594PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 Py_ssize_t size,
4596 const char *errors,
4597 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004598{
4599 PyUnicodeObject *v = NULL;
4600 int done;
4601
4602 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004603 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004604
4605#ifdef NEED_RETRY
4606 retry:
4607 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004608 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004609 else
4610#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004612
4613 if (done < 0) {
4614 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004616 }
4617
4618 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004619 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004620
4621#ifdef NEED_RETRY
4622 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 s += done;
4624 size -= done;
4625 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004626 }
4627#endif
4628
4629 return (PyObject *)v;
4630}
4631
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004632PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004633 Py_ssize_t size,
4634 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004635{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004636 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4637}
4638
4639/*
4640 * Convert unicode into string object (MBCS).
4641 * Returns 0 if succeed, -1 otherwise.
4642 */
4643static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 const Py_UNICODE *p, /* unicode */
4645 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004646{
4647 int mbcssize = 0;
4648 Py_ssize_t n = 0;
4649
4650 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004651
4652 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004653 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4655 if (mbcssize == 0) {
4656 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4657 return -1;
4658 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004659 }
4660
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004661 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 /* Create string object */
4663 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4664 if (*repr == NULL)
4665 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004666 }
4667 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 /* Extend string object */
4669 n = PyBytes_Size(*repr);
4670 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4671 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004672 }
4673
4674 /* Do the conversion */
4675 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004676 char *s = PyBytes_AS_STRING(*repr) + n;
4677 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4678 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4679 return -1;
4680 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004681 }
4682
4683 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004684}
4685
4686PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004687 Py_ssize_t size,
4688 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004689{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004690 PyObject *repr = NULL;
4691 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004692
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004693#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004695 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004696 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004697 else
4698#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004699 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004700
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004701 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004702 Py_XDECREF(repr);
4703 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004704 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004705
4706#ifdef NEED_RETRY
4707 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004708 p += INT_MAX;
4709 size -= INT_MAX;
4710 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004711 }
4712#endif
4713
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004714 return repr;
4715}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004716
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004717PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4718{
4719 if (!PyUnicode_Check(unicode)) {
4720 PyErr_BadArgument();
4721 return NULL;
4722 }
4723 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004724 PyUnicode_GET_SIZE(unicode),
4725 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004726}
4727
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004728#undef NEED_RETRY
4729
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004730#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004731
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732/* --- Character Mapping Codec -------------------------------------------- */
4733
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004735 Py_ssize_t size,
4736 PyObject *mapping,
4737 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004740 Py_ssize_t startinpos;
4741 Py_ssize_t endinpos;
4742 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 PyUnicodeObject *v;
4745 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004746 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004747 PyObject *errorHandler = NULL;
4748 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004749 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004750 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004751
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752 /* Default to Latin-1 */
4753 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004754 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755
4756 v = _PyUnicode_New(size);
4757 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004760 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004763 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 mapstring = PyUnicode_AS_UNICODE(mapping);
4765 maplen = PyUnicode_GET_SIZE(mapping);
4766 while (s < e) {
4767 unsigned char ch = *s;
4768 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 if (ch < maplen)
4771 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 if (x == 0xfffe) {
4774 /* undefined mapping */
4775 outpos = p-PyUnicode_AS_UNICODE(v);
4776 startinpos = s-starts;
4777 endinpos = startinpos+1;
4778 if (unicode_decode_call_errorhandler(
4779 errors, &errorHandler,
4780 "charmap", "character maps to <undefined>",
4781 &starts, &e, &startinpos, &endinpos, &exc, &s,
4782 &v, &outpos, &p)) {
4783 goto onError;
4784 }
4785 continue;
4786 }
4787 *p++ = x;
4788 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004789 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004790 }
4791 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 while (s < e) {
4793 unsigned char ch = *s;
4794 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004795
Benjamin Peterson29060642009-01-31 22:14:21 +00004796 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4797 w = PyLong_FromLong((long)ch);
4798 if (w == NULL)
4799 goto onError;
4800 x = PyObject_GetItem(mapping, w);
4801 Py_DECREF(w);
4802 if (x == NULL) {
4803 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4804 /* No mapping found means: mapping is undefined. */
4805 PyErr_Clear();
4806 x = Py_None;
4807 Py_INCREF(x);
4808 } else
4809 goto onError;
4810 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004811
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 /* Apply mapping */
4813 if (PyLong_Check(x)) {
4814 long value = PyLong_AS_LONG(x);
4815 if (value < 0 || value > 65535) {
4816 PyErr_SetString(PyExc_TypeError,
4817 "character mapping must be in range(65536)");
4818 Py_DECREF(x);
4819 goto onError;
4820 }
4821 *p++ = (Py_UNICODE)value;
4822 }
4823 else if (x == Py_None) {
4824 /* undefined mapping */
4825 outpos = p-PyUnicode_AS_UNICODE(v);
4826 startinpos = s-starts;
4827 endinpos = startinpos+1;
4828 if (unicode_decode_call_errorhandler(
4829 errors, &errorHandler,
4830 "charmap", "character maps to <undefined>",
4831 &starts, &e, &startinpos, &endinpos, &exc, &s,
4832 &v, &outpos, &p)) {
4833 Py_DECREF(x);
4834 goto onError;
4835 }
4836 Py_DECREF(x);
4837 continue;
4838 }
4839 else if (PyUnicode_Check(x)) {
4840 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004841
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 if (targetsize == 1)
4843 /* 1-1 mapping */
4844 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004845
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 else if (targetsize > 1) {
4847 /* 1-n mapping */
4848 if (targetsize > extrachars) {
4849 /* resize first */
4850 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4851 Py_ssize_t needed = (targetsize - extrachars) + \
4852 (targetsize << 2);
4853 extrachars += needed;
4854 /* XXX overflow detection missing */
4855 if (_PyUnicode_Resize(&v,
4856 PyUnicode_GET_SIZE(v) + needed) < 0) {
4857 Py_DECREF(x);
4858 goto onError;
4859 }
4860 p = PyUnicode_AS_UNICODE(v) + oldpos;
4861 }
4862 Py_UNICODE_COPY(p,
4863 PyUnicode_AS_UNICODE(x),
4864 targetsize);
4865 p += targetsize;
4866 extrachars -= targetsize;
4867 }
4868 /* 1-0 mapping: skip the character */
4869 }
4870 else {
4871 /* wrong return value */
4872 PyErr_SetString(PyExc_TypeError,
4873 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004874 Py_DECREF(x);
4875 goto onError;
4876 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 Py_DECREF(x);
4878 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 }
4881 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4883 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004884 Py_XDECREF(errorHandler);
4885 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004887
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 Py_XDECREF(errorHandler);
4890 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 Py_XDECREF(v);
4892 return NULL;
4893}
4894
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004895/* Charmap encoding: the lookup table */
4896
4897struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 PyObject_HEAD
4899 unsigned char level1[32];
4900 int count2, count3;
4901 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004902};
4903
4904static PyObject*
4905encoding_map_size(PyObject *obj, PyObject* args)
4906{
4907 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004908 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004909 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004910}
4911
4912static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004913 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 PyDoc_STR("Return the size (in bytes) of this object") },
4915 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004916};
4917
4918static void
4919encoding_map_dealloc(PyObject* o)
4920{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004921 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004922}
4923
4924static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004925 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 "EncodingMap", /*tp_name*/
4927 sizeof(struct encoding_map), /*tp_basicsize*/
4928 0, /*tp_itemsize*/
4929 /* methods */
4930 encoding_map_dealloc, /*tp_dealloc*/
4931 0, /*tp_print*/
4932 0, /*tp_getattr*/
4933 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004934 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 0, /*tp_repr*/
4936 0, /*tp_as_number*/
4937 0, /*tp_as_sequence*/
4938 0, /*tp_as_mapping*/
4939 0, /*tp_hash*/
4940 0, /*tp_call*/
4941 0, /*tp_str*/
4942 0, /*tp_getattro*/
4943 0, /*tp_setattro*/
4944 0, /*tp_as_buffer*/
4945 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4946 0, /*tp_doc*/
4947 0, /*tp_traverse*/
4948 0, /*tp_clear*/
4949 0, /*tp_richcompare*/
4950 0, /*tp_weaklistoffset*/
4951 0, /*tp_iter*/
4952 0, /*tp_iternext*/
4953 encoding_map_methods, /*tp_methods*/
4954 0, /*tp_members*/
4955 0, /*tp_getset*/
4956 0, /*tp_base*/
4957 0, /*tp_dict*/
4958 0, /*tp_descr_get*/
4959 0, /*tp_descr_set*/
4960 0, /*tp_dictoffset*/
4961 0, /*tp_init*/
4962 0, /*tp_alloc*/
4963 0, /*tp_new*/
4964 0, /*tp_free*/
4965 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004966};
4967
4968PyObject*
4969PyUnicode_BuildEncodingMap(PyObject* string)
4970{
4971 Py_UNICODE *decode;
4972 PyObject *result;
4973 struct encoding_map *mresult;
4974 int i;
4975 int need_dict = 0;
4976 unsigned char level1[32];
4977 unsigned char level2[512];
4978 unsigned char *mlevel1, *mlevel2, *mlevel3;
4979 int count2 = 0, count3 = 0;
4980
4981 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4982 PyErr_BadArgument();
4983 return NULL;
4984 }
4985 decode = PyUnicode_AS_UNICODE(string);
4986 memset(level1, 0xFF, sizeof level1);
4987 memset(level2, 0xFF, sizeof level2);
4988
4989 /* If there isn't a one-to-one mapping of NULL to \0,
4990 or if there are non-BMP characters, we need to use
4991 a mapping dictionary. */
4992 if (decode[0] != 0)
4993 need_dict = 1;
4994 for (i = 1; i < 256; i++) {
4995 int l1, l2;
4996 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004997#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004998 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004999#endif
5000 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005001 need_dict = 1;
5002 break;
5003 }
5004 if (decode[i] == 0xFFFE)
5005 /* unmapped character */
5006 continue;
5007 l1 = decode[i] >> 11;
5008 l2 = decode[i] >> 7;
5009 if (level1[l1] == 0xFF)
5010 level1[l1] = count2++;
5011 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005012 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005013 }
5014
5015 if (count2 >= 0xFF || count3 >= 0xFF)
5016 need_dict = 1;
5017
5018 if (need_dict) {
5019 PyObject *result = PyDict_New();
5020 PyObject *key, *value;
5021 if (!result)
5022 return NULL;
5023 for (i = 0; i < 256; i++) {
5024 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005025 key = PyLong_FromLong(decode[i]);
5026 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005027 if (!key || !value)
5028 goto failed1;
5029 if (PyDict_SetItem(result, key, value) == -1)
5030 goto failed1;
5031 Py_DECREF(key);
5032 Py_DECREF(value);
5033 }
5034 return result;
5035 failed1:
5036 Py_XDECREF(key);
5037 Py_XDECREF(value);
5038 Py_DECREF(result);
5039 return NULL;
5040 }
5041
5042 /* Create a three-level trie */
5043 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5044 16*count2 + 128*count3 - 1);
5045 if (!result)
5046 return PyErr_NoMemory();
5047 PyObject_Init(result, &EncodingMapType);
5048 mresult = (struct encoding_map*)result;
5049 mresult->count2 = count2;
5050 mresult->count3 = count3;
5051 mlevel1 = mresult->level1;
5052 mlevel2 = mresult->level23;
5053 mlevel3 = mresult->level23 + 16*count2;
5054 memcpy(mlevel1, level1, 32);
5055 memset(mlevel2, 0xFF, 16*count2);
5056 memset(mlevel3, 0, 128*count3);
5057 count3 = 0;
5058 for (i = 1; i < 256; i++) {
5059 int o1, o2, o3, i2, i3;
5060 if (decode[i] == 0xFFFE)
5061 /* unmapped character */
5062 continue;
5063 o1 = decode[i]>>11;
5064 o2 = (decode[i]>>7) & 0xF;
5065 i2 = 16*mlevel1[o1] + o2;
5066 if (mlevel2[i2] == 0xFF)
5067 mlevel2[i2] = count3++;
5068 o3 = decode[i] & 0x7F;
5069 i3 = 128*mlevel2[i2] + o3;
5070 mlevel3[i3] = i;
5071 }
5072 return result;
5073}
5074
5075static int
5076encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5077{
5078 struct encoding_map *map = (struct encoding_map*)mapping;
5079 int l1 = c>>11;
5080 int l2 = (c>>7) & 0xF;
5081 int l3 = c & 0x7F;
5082 int i;
5083
5084#ifdef Py_UNICODE_WIDE
5085 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005087 }
5088#endif
5089 if (c == 0)
5090 return 0;
5091 /* level 1*/
5092 i = map->level1[l1];
5093 if (i == 0xFF) {
5094 return -1;
5095 }
5096 /* level 2*/
5097 i = map->level23[16*i+l2];
5098 if (i == 0xFF) {
5099 return -1;
5100 }
5101 /* level 3 */
5102 i = map->level23[16*map->count2 + 128*i + l3];
5103 if (i == 0) {
5104 return -1;
5105 }
5106 return i;
5107}
5108
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109/* Lookup the character ch in the mapping. If the character
5110 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005111 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113{
Christian Heimes217cfd12007-12-02 14:31:20 +00005114 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 PyObject *x;
5116
5117 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005119 x = PyObject_GetItem(mapping, w);
5120 Py_DECREF(w);
5121 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5123 /* No mapping found means: mapping is undefined. */
5124 PyErr_Clear();
5125 x = Py_None;
5126 Py_INCREF(x);
5127 return x;
5128 } else
5129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005131 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005133 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 long value = PyLong_AS_LONG(x);
5135 if (value < 0 || value > 255) {
5136 PyErr_SetString(PyExc_TypeError,
5137 "character mapping must be in range(256)");
5138 Py_DECREF(x);
5139 return NULL;
5140 }
5141 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005143 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 /* wrong return value */
5147 PyErr_Format(PyExc_TypeError,
5148 "character mapping must return integer, bytes or None, not %.400s",
5149 x->ob_type->tp_name);
5150 Py_DECREF(x);
5151 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 }
5153}
5154
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005155static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005156charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005157{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005158 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5159 /* exponentially overallocate to minimize reallocations */
5160 if (requiredsize < 2*outsize)
5161 requiredsize = 2*outsize;
5162 if (_PyBytes_Resize(outobj, requiredsize))
5163 return -1;
5164 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005165}
5166
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* the lookup or the resize raised an exception */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005170/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005171 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005172 space is available. Return a new reference to the object that
5173 was put in the output buffer, or Py_None, if the mapping was undefined
5174 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005175 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005176static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005177charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005180 PyObject *rep;
5181 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005182 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005183
Christian Heimes90aa7642007-12-19 02:45:37 +00005184 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005185 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005186 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005187 if (res == -1)
5188 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 if (outsize<requiredsize)
5190 if (charmapencode_resize(outobj, outpos, requiredsize))
5191 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005192 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 outstart[(*outpos)++] = (char)res;
5194 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005195 }
5196
5197 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005200 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005201 Py_DECREF(rep);
5202 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005203 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 if (PyLong_Check(rep)) {
5205 Py_ssize_t requiredsize = *outpos+1;
5206 if (outsize<requiredsize)
5207 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5208 Py_DECREF(rep);
5209 return enc_EXCEPTION;
5210 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005211 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005213 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 else {
5215 const char *repchars = PyBytes_AS_STRING(rep);
5216 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5217 Py_ssize_t requiredsize = *outpos+repsize;
5218 if (outsize<requiredsize)
5219 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5220 Py_DECREF(rep);
5221 return enc_EXCEPTION;
5222 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005223 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 memcpy(outstart + *outpos, repchars, repsize);
5225 *outpos += repsize;
5226 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005227 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005228 Py_DECREF(rep);
5229 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005230}
5231
5232/* handle an error in PyUnicode_EncodeCharmap
5233 Return 0 on success, -1 on error */
5234static
5235int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005236 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005238 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005239 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005240{
5241 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005242 Py_ssize_t repsize;
5243 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 Py_UNICODE *uni2;
5245 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005246 Py_ssize_t collstartpos = *inpos;
5247 Py_ssize_t collendpos = *inpos+1;
5248 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005249 char *encoding = "charmap";
5250 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005251 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005253 /* find all unencodable characters */
5254 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005255 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005256 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 int res = encoding_map_lookup(p[collendpos], mapping);
5258 if (res != -1)
5259 break;
5260 ++collendpos;
5261 continue;
5262 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005263
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 rep = charmapencode_lookup(p[collendpos], mapping);
5265 if (rep==NULL)
5266 return -1;
5267 else if (rep!=Py_None) {
5268 Py_DECREF(rep);
5269 break;
5270 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005271 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005273 }
5274 /* cache callback name lookup
5275 * (if not done yet, i.e. it's the first error) */
5276 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 if ((errors==NULL) || (!strcmp(errors, "strict")))
5278 *known_errorHandler = 1;
5279 else if (!strcmp(errors, "replace"))
5280 *known_errorHandler = 2;
5281 else if (!strcmp(errors, "ignore"))
5282 *known_errorHandler = 3;
5283 else if (!strcmp(errors, "xmlcharrefreplace"))
5284 *known_errorHandler = 4;
5285 else
5286 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005287 }
5288 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005289 case 1: /* strict */
5290 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5291 return -1;
5292 case 2: /* replace */
5293 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 x = charmapencode_output('?', mapping, res, respos);
5295 if (x==enc_EXCEPTION) {
5296 return -1;
5297 }
5298 else if (x==enc_FAILED) {
5299 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5300 return -1;
5301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005302 }
5303 /* fall through */
5304 case 3: /* ignore */
5305 *inpos = collendpos;
5306 break;
5307 case 4: /* xmlcharrefreplace */
5308 /* generate replacement (temporarily (mis)uses p) */
5309 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 char buffer[2+29+1+1];
5311 char *cp;
5312 sprintf(buffer, "&#%d;", (int)p[collpos]);
5313 for (cp = buffer; *cp; ++cp) {
5314 x = charmapencode_output(*cp, mapping, res, respos);
5315 if (x==enc_EXCEPTION)
5316 return -1;
5317 else if (x==enc_FAILED) {
5318 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5319 return -1;
5320 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005321 }
5322 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005323 *inpos = collendpos;
5324 break;
5325 default:
5326 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 encoding, reason, p, size, exceptionObject,
5328 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005329 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005331 if (PyBytes_Check(repunicode)) {
5332 /* Directly copy bytes result to output. */
5333 Py_ssize_t outsize = PyBytes_Size(*res);
5334 Py_ssize_t requiredsize;
5335 repsize = PyBytes_Size(repunicode);
5336 requiredsize = *respos + repsize;
5337 if (requiredsize > outsize)
5338 /* Make room for all additional bytes. */
5339 if (charmapencode_resize(res, respos, requiredsize)) {
5340 Py_DECREF(repunicode);
5341 return -1;
5342 }
5343 memcpy(PyBytes_AsString(*res) + *respos,
5344 PyBytes_AsString(repunicode), repsize);
5345 *respos += repsize;
5346 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005347 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005348 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005349 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005350 /* generate replacement */
5351 repsize = PyUnicode_GET_SIZE(repunicode);
5352 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 x = charmapencode_output(*uni2, mapping, res, respos);
5354 if (x==enc_EXCEPTION) {
5355 return -1;
5356 }
5357 else if (x==enc_FAILED) {
5358 Py_DECREF(repunicode);
5359 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5360 return -1;
5361 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005362 }
5363 *inpos = newpos;
5364 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365 }
5366 return 0;
5367}
5368
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 Py_ssize_t size,
5371 PyObject *mapping,
5372 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005374 /* output object */
5375 PyObject *res = NULL;
5376 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005377 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005379 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380 PyObject *errorHandler = NULL;
5381 PyObject *exc = NULL;
5382 /* the following variable is used for caching string comparisons
5383 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5384 * 3=ignore, 4=xmlcharrefreplace */
5385 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386
5387 /* Default to Latin-1 */
5388 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005391 /* allocate enough for a simple encoding without
5392 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005393 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005394 if (res == NULL)
5395 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005396 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005399 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 /* try to encode it */
5401 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5402 if (x==enc_EXCEPTION) /* error */
5403 goto onError;
5404 if (x==enc_FAILED) { /* unencodable character */
5405 if (charmap_encoding_error(p, size, &inpos, mapping,
5406 &exc,
5407 &known_errorHandler, &errorHandler, errors,
5408 &res, &respos)) {
5409 goto onError;
5410 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005411 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 else
5413 /* done with this character => adjust input position */
5414 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005417 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005418 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005419 if (_PyBytes_Resize(&res, respos) < 0)
5420 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005422 Py_XDECREF(exc);
5423 Py_XDECREF(errorHandler);
5424 return res;
5425
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005427 Py_XDECREF(res);
5428 Py_XDECREF(exc);
5429 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 return NULL;
5431}
5432
5433PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005434 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435{
5436 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 PyErr_BadArgument();
5438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 }
5440 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 PyUnicode_GET_SIZE(unicode),
5442 mapping,
5443 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444}
5445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446/* create or adjust a UnicodeTranslateError */
5447static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 const Py_UNICODE *unicode, Py_ssize_t size,
5449 Py_ssize_t startpos, Py_ssize_t endpos,
5450 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005452 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005453 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 }
5456 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5458 goto onError;
5459 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5460 goto onError;
5461 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5462 goto onError;
5463 return;
5464 onError:
5465 Py_DECREF(*exceptionObject);
5466 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 }
5468}
5469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005470/* raises a UnicodeTranslateError */
5471static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 const Py_UNICODE *unicode, Py_ssize_t size,
5473 Py_ssize_t startpos, Py_ssize_t endpos,
5474 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475{
5476 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480}
5481
5482/* error handling callback helper:
5483 build arguments, call the callback and check the arguments,
5484 put the result into newpos and return the replacement string, which
5485 has to be freed by the caller */
5486static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005487 PyObject **errorHandler,
5488 const char *reason,
5489 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5490 Py_ssize_t startpos, Py_ssize_t endpos,
5491 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005492{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005493 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005494
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005495 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005496 PyObject *restuple;
5497 PyObject *resunicode;
5498
5499 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503 }
5504
5505 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005507 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509
5510 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005514 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005515 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 Py_DECREF(restuple);
5517 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 }
5519 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 &resunicode, &i_newpos)) {
5521 Py_DECREF(restuple);
5522 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005523 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005524 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005526 else
5527 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005528 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5530 Py_DECREF(restuple);
5531 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005532 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005533 Py_INCREF(resunicode);
5534 Py_DECREF(restuple);
5535 return resunicode;
5536}
5537
5538/* Lookup the character ch in the mapping and put the result in result,
5539 which must be decrefed by the caller.
5540 Return 0 on success, -1 on error */
5541static
5542int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5543{
Christian Heimes217cfd12007-12-02 14:31:20 +00005544 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005545 PyObject *x;
5546
5547 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005549 x = PyObject_GetItem(mapping, w);
5550 Py_DECREF(w);
5551 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5553 /* No mapping found means: use 1:1 mapping. */
5554 PyErr_Clear();
5555 *result = NULL;
5556 return 0;
5557 } else
5558 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559 }
5560 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005561 *result = x;
5562 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005564 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 long value = PyLong_AS_LONG(x);
5566 long max = PyUnicode_GetMax();
5567 if (value < 0 || value > max) {
5568 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005569 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 Py_DECREF(x);
5571 return -1;
5572 }
5573 *result = x;
5574 return 0;
5575 }
5576 else if (PyUnicode_Check(x)) {
5577 *result = x;
5578 return 0;
5579 }
5580 else {
5581 /* wrong return value */
5582 PyErr_SetString(PyExc_TypeError,
5583 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005584 Py_DECREF(x);
5585 return -1;
5586 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587}
5588/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 if not reallocate and adjust various state variables.
5590 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591static
Walter Dörwald4894c302003-10-24 14:25:28 +00005592int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005595 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005596 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 /* remember old output position */
5598 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5599 /* exponentially overallocate to minimize reallocations */
5600 if (requiredsize < 2 * oldsize)
5601 requiredsize = 2 * oldsize;
5602 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5603 return -1;
5604 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005605 }
5606 return 0;
5607}
5608/* lookup the character, put the result in the output string and adjust
5609 various state variables. Return a new reference to the object that
5610 was put in the output buffer in *result, or Py_None, if the mapping was
5611 undefined (in which case no character was written).
5612 The called must decref result.
5613 Return 0 on success, -1 on error. */
5614static
Walter Dörwald4894c302003-10-24 14:25:28 +00005615int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5617 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618{
Walter Dörwald4894c302003-10-24 14:25:28 +00005619 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005621 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 /* not found => default to 1:1 mapping */
5623 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 }
5625 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005627 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 /* no overflow check, because we know that the space is enough */
5629 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 }
5631 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5633 if (repsize==1) {
5634 /* no overflow check, because we know that the space is enough */
5635 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5636 }
5637 else if (repsize!=0) {
5638 /* more than one character */
5639 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5640 (insize - (curinp-startinp)) +
5641 repsize - 1;
5642 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5643 return -1;
5644 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5645 *outp += repsize;
5646 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 }
5648 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650 return 0;
5651}
5652
5653PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005654 Py_ssize_t size,
5655 PyObject *mapping,
5656 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658 /* output object */
5659 PyObject *res = NULL;
5660 /* pointers to the beginning and end+1 of input */
5661 const Py_UNICODE *startp = p;
5662 const Py_UNICODE *endp = p + size;
5663 /* pointer into the output */
5664 Py_UNICODE *str;
5665 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005666 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667 char *reason = "character maps to <undefined>";
5668 PyObject *errorHandler = NULL;
5669 PyObject *exc = NULL;
5670 /* the following variable is used for caching string comparisons
5671 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5672 * 3=ignore, 4=xmlcharrefreplace */
5673 int known_errorHandler = -1;
5674
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 PyErr_BadArgument();
5677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679
5680 /* allocate enough for a simple 1:1 translation without
5681 replacements, if we need more, we'll resize */
5682 res = PyUnicode_FromUnicode(NULL, size);
5683 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 /* try to encode it */
5691 PyObject *x = NULL;
5692 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5693 Py_XDECREF(x);
5694 goto onError;
5695 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005696 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 if (x!=Py_None) /* it worked => adjust input pointer */
5698 ++p;
5699 else { /* untranslatable character */
5700 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5701 Py_ssize_t repsize;
5702 Py_ssize_t newpos;
5703 Py_UNICODE *uni2;
5704 /* startpos for collecting untranslatable chars */
5705 const Py_UNICODE *collstart = p;
5706 const Py_UNICODE *collend = p+1;
5707 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 /* find all untranslatable characters */
5710 while (collend < endp) {
5711 if (charmaptranslate_lookup(*collend, mapping, &x))
5712 goto onError;
5713 Py_XDECREF(x);
5714 if (x!=Py_None)
5715 break;
5716 ++collend;
5717 }
5718 /* cache callback name lookup
5719 * (if not done yet, i.e. it's the first error) */
5720 if (known_errorHandler==-1) {
5721 if ((errors==NULL) || (!strcmp(errors, "strict")))
5722 known_errorHandler = 1;
5723 else if (!strcmp(errors, "replace"))
5724 known_errorHandler = 2;
5725 else if (!strcmp(errors, "ignore"))
5726 known_errorHandler = 3;
5727 else if (!strcmp(errors, "xmlcharrefreplace"))
5728 known_errorHandler = 4;
5729 else
5730 known_errorHandler = 0;
5731 }
5732 switch (known_errorHandler) {
5733 case 1: /* strict */
5734 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005735 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 case 2: /* replace */
5737 /* No need to check for space, this is a 1:1 replacement */
5738 for (coll = collstart; coll<collend; ++coll)
5739 *str++ = '?';
5740 /* fall through */
5741 case 3: /* ignore */
5742 p = collend;
5743 break;
5744 case 4: /* xmlcharrefreplace */
5745 /* generate replacement (temporarily (mis)uses p) */
5746 for (p = collstart; p < collend; ++p) {
5747 char buffer[2+29+1+1];
5748 char *cp;
5749 sprintf(buffer, "&#%d;", (int)*p);
5750 if (charmaptranslate_makespace(&res, &str,
5751 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5752 goto onError;
5753 for (cp = buffer; *cp; ++cp)
5754 *str++ = *cp;
5755 }
5756 p = collend;
5757 break;
5758 default:
5759 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5760 reason, startp, size, &exc,
5761 collstart-startp, collend-startp, &newpos);
5762 if (repunicode == NULL)
5763 goto onError;
5764 /* generate replacement */
5765 repsize = PyUnicode_GET_SIZE(repunicode);
5766 if (charmaptranslate_makespace(&res, &str,
5767 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5768 Py_DECREF(repunicode);
5769 goto onError;
5770 }
5771 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5772 *str++ = *uni2;
5773 p = startp + newpos;
5774 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005775 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005776 }
5777 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 /* Resize if we allocated to much */
5779 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005780 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 if (PyUnicode_Resize(&res, respos) < 0)
5782 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 }
5784 Py_XDECREF(exc);
5785 Py_XDECREF(errorHandler);
5786 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 Py_XDECREF(res);
5790 Py_XDECREF(exc);
5791 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 return NULL;
5793}
5794
5795PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 PyObject *mapping,
5797 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798{
5799 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005800
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 str = PyUnicode_FromObject(str);
5802 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 PyUnicode_GET_SIZE(str),
5806 mapping,
5807 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 Py_DECREF(str);
5809 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005810
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 Py_XDECREF(str);
5813 return NULL;
5814}
Tim Petersced69f82003-09-16 20:30:58 +00005815
Guido van Rossum9e896b32000-04-05 20:11:21 +00005816/* --- Decimal Encoder ---------------------------------------------------- */
5817
5818int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 Py_ssize_t length,
5820 char *output,
5821 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005822{
5823 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 PyObject *errorHandler = NULL;
5825 PyObject *exc = NULL;
5826 const char *encoding = "decimal";
5827 const char *reason = "invalid decimal Unicode string";
5828 /* the following variable is used for caching string comparisons
5829 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5830 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005831
5832 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 PyErr_BadArgument();
5834 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005835 }
5836
5837 p = s;
5838 end = s + length;
5839 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 register Py_UNICODE ch = *p;
5841 int decimal;
5842 PyObject *repunicode;
5843 Py_ssize_t repsize;
5844 Py_ssize_t newpos;
5845 Py_UNICODE *uni2;
5846 Py_UNICODE *collstart;
5847 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005848
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005850 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 ++p;
5852 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005853 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 decimal = Py_UNICODE_TODECIMAL(ch);
5855 if (decimal >= 0) {
5856 *output++ = '0' + decimal;
5857 ++p;
5858 continue;
5859 }
5860 if (0 < ch && ch < 256) {
5861 *output++ = (char)ch;
5862 ++p;
5863 continue;
5864 }
5865 /* All other characters are considered unencodable */
5866 collstart = p;
5867 collend = p+1;
5868 while (collend < end) {
5869 if ((0 < *collend && *collend < 256) ||
5870 !Py_UNICODE_ISSPACE(*collend) ||
5871 Py_UNICODE_TODECIMAL(*collend))
5872 break;
5873 }
5874 /* cache callback name lookup
5875 * (if not done yet, i.e. it's the first error) */
5876 if (known_errorHandler==-1) {
5877 if ((errors==NULL) || (!strcmp(errors, "strict")))
5878 known_errorHandler = 1;
5879 else if (!strcmp(errors, "replace"))
5880 known_errorHandler = 2;
5881 else if (!strcmp(errors, "ignore"))
5882 known_errorHandler = 3;
5883 else if (!strcmp(errors, "xmlcharrefreplace"))
5884 known_errorHandler = 4;
5885 else
5886 known_errorHandler = 0;
5887 }
5888 switch (known_errorHandler) {
5889 case 1: /* strict */
5890 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5891 goto onError;
5892 case 2: /* replace */
5893 for (p = collstart; p < collend; ++p)
5894 *output++ = '?';
5895 /* fall through */
5896 case 3: /* ignore */
5897 p = collend;
5898 break;
5899 case 4: /* xmlcharrefreplace */
5900 /* generate replacement (temporarily (mis)uses p) */
5901 for (p = collstart; p < collend; ++p)
5902 output += sprintf(output, "&#%d;", (int)*p);
5903 p = collend;
5904 break;
5905 default:
5906 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5907 encoding, reason, s, length, &exc,
5908 collstart-s, collend-s, &newpos);
5909 if (repunicode == NULL)
5910 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005911 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005912 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005913 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5914 Py_DECREF(repunicode);
5915 goto onError;
5916 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 /* generate replacement */
5918 repsize = PyUnicode_GET_SIZE(repunicode);
5919 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5920 Py_UNICODE ch = *uni2;
5921 if (Py_UNICODE_ISSPACE(ch))
5922 *output++ = ' ';
5923 else {
5924 decimal = Py_UNICODE_TODECIMAL(ch);
5925 if (decimal >= 0)
5926 *output++ = '0' + decimal;
5927 else if (0 < ch && ch < 256)
5928 *output++ = (char)ch;
5929 else {
5930 Py_DECREF(repunicode);
5931 raise_encode_exception(&exc, encoding,
5932 s, length, collstart-s, collend-s, reason);
5933 goto onError;
5934 }
5935 }
5936 }
5937 p = s + newpos;
5938 Py_DECREF(repunicode);
5939 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005940 }
5941 /* 0-terminate the output string */
5942 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005943 Py_XDECREF(exc);
5944 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005945 return 0;
5946
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005948 Py_XDECREF(exc);
5949 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005950 return -1;
5951}
5952
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953/* --- Helpers ------------------------------------------------------------ */
5954
Eric Smith8c663262007-08-25 02:26:07 +00005955#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005956#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005957
Thomas Wouters477c8d52006-05-27 19:21:47 +00005958#include "stringlib/count.h"
5959#include "stringlib/find.h"
5960#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005961#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005962
Eric Smith5807c412008-05-11 21:00:57 +00005963#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005964#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005965#include "stringlib/localeutil.h"
5966
/* Helper macro to fix up start/end slice values against a length `len`.

   - `end` greater than `len` is clamped to `len`; a negative `end` has
     `len` added to it and is then clamped to 0 if still negative.
   - A negative `start` has `len` added to it and is then clamped to 0 if
     still negative.  `start` is NOT clamped to `len`, so callers must be
     prepared for start > end after the adjustment.

   `start` and `end` must be modifiable lvalues; every argument is
   evaluated more than once, so none of them may have side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to exactly one statement and can be used safely (followed by a
   semicolon) inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005981
Martin v. Löwis18e16552006-02-15 17:27:45 +00005982Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005983 PyObject *substr,
5984 Py_ssize_t start,
5985 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005987 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005988 PyUnicodeObject* str_obj;
5989 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005990
Thomas Wouters477c8d52006-05-27 19:21:47 +00005991 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5992 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005994 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5995 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 Py_DECREF(str_obj);
5997 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 }
Tim Petersced69f82003-09-16 20:30:58 +00005999
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006000 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006001 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006002 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6003 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006004 );
6005
6006 Py_DECREF(sub_obj);
6007 Py_DECREF(str_obj);
6008
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 return result;
6010}
6011
Martin v. Löwis18e16552006-02-15 17:27:45 +00006012Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006013 PyObject *sub,
6014 Py_ssize_t start,
6015 Py_ssize_t end,
6016 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006018 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006019
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006021 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006023 sub = PyUnicode_FromObject(sub);
6024 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 Py_DECREF(str);
6026 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 }
Tim Petersced69f82003-09-16 20:30:58 +00006028
Thomas Wouters477c8d52006-05-27 19:21:47 +00006029 if (direction > 0)
6030 result = stringlib_find_slice(
6031 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6032 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6033 start, end
6034 );
6035 else
6036 result = stringlib_rfind_slice(
6037 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6038 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6039 start, end
6040 );
6041
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006043 Py_DECREF(sub);
6044
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 return result;
6046}
6047
Tim Petersced69f82003-09-16 20:30:58 +00006048static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 PyUnicodeObject *substring,
6051 Py_ssize_t start,
6052 Py_ssize_t end,
6053 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 if (substring->length == 0)
6056 return 1;
6057
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006058 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 end -= substring->length;
6060 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062
6063 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 if (Py_UNICODE_MATCH(self, end, substring))
6065 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 } else {
6067 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 }
6070
6071 return 0;
6072}
6073
Martin v. Löwis18e16552006-02-15 17:27:45 +00006074Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 PyObject *substr,
6076 Py_ssize_t start,
6077 Py_ssize_t end,
6078 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006081
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 str = PyUnicode_FromObject(str);
6083 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 substr = PyUnicode_FromObject(substr);
6086 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 Py_DECREF(str);
6088 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 }
Tim Petersced69f82003-09-16 20:30:58 +00006090
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 (PyUnicodeObject *)substr,
6093 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 Py_DECREF(str);
6095 Py_DECREF(substr);
6096 return result;
6097}
6098
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099/* Apply fixfct filter to the Unicode object self and return a
6100 reference to the modified object */
6101
Tim Petersced69f82003-09-16 20:30:58 +00006102static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
6106
6107 PyUnicodeObject *u;
6108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006109 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006112
6113 Py_UNICODE_COPY(u->str, self->str, self->length);
6114
Tim Peters7a29bd52001-09-12 03:03:31 +00006115 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 /* fixfct should return TRUE if it modified the buffer. If
6117 FALSE, return a reference to the original buffer instead
6118 (to save space, not time) */
6119 Py_INCREF(self);
6120 Py_DECREF(u);
6121 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 }
6123 return (PyObject*) u;
6124}
6125
Tim Petersced69f82003-09-16 20:30:58 +00006126static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127int fixupper(PyUnicodeObject *self)
6128{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 Py_UNICODE *s = self->str;
6131 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006135
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 ch = Py_UNICODE_TOUPPER(*s);
6137 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 *s = ch;
6140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 s++;
6142 }
6143
6144 return status;
6145}
6146
Tim Petersced69f82003-09-16 20:30:58 +00006147static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148int fixlower(PyUnicodeObject *self)
6149{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006150 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 Py_UNICODE *s = self->str;
6152 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006156
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 ch = Py_UNICODE_TOLOWER(*s);
6158 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 *s = ch;
6161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 s++;
6163 }
6164
6165 return status;
6166}
6167
Tim Petersced69f82003-09-16 20:30:58 +00006168static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169int fixswapcase(PyUnicodeObject *self)
6170{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006171 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 Py_UNICODE *s = self->str;
6173 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006174
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 while (len-- > 0) {
6176 if (Py_UNICODE_ISUPPER(*s)) {
6177 *s = Py_UNICODE_TOLOWER(*s);
6178 status = 1;
6179 } else if (Py_UNICODE_ISLOWER(*s)) {
6180 *s = Py_UNICODE_TOUPPER(*s);
6181 status = 1;
6182 }
6183 s++;
6184 }
6185
6186 return status;
6187}
6188
Tim Petersced69f82003-09-16 20:30:58 +00006189static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190int fixcapitalize(PyUnicodeObject *self)
6191{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006192 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006193 Py_UNICODE *s = self->str;
6194 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006195
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006196 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006198 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 *s = Py_UNICODE_TOUPPER(*s);
6200 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006202 s++;
6203 while (--len > 0) {
6204 if (Py_UNICODE_ISUPPER(*s)) {
6205 *s = Py_UNICODE_TOLOWER(*s);
6206 status = 1;
6207 }
6208 s++;
6209 }
6210 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211}
6212
6213static
6214int fixtitle(PyUnicodeObject *self)
6215{
6216 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6217 register Py_UNICODE *e;
6218 int previous_is_cased;
6219
6220 /* Shortcut for single character strings */
6221 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6223 if (*p != ch) {
6224 *p = ch;
6225 return 1;
6226 }
6227 else
6228 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 }
Tim Petersced69f82003-09-16 20:30:58 +00006230
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 e = p + PyUnicode_GET_SIZE(self);
6232 previous_is_cased = 0;
6233 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006235
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 if (previous_is_cased)
6237 *p = Py_UNICODE_TOLOWER(ch);
6238 else
6239 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006240
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 if (Py_UNICODE_ISLOWER(ch) ||
6242 Py_UNICODE_ISUPPER(ch) ||
6243 Py_UNICODE_ISTITLE(ch))
6244 previous_is_cased = 1;
6245 else
6246 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 }
6248 return 1;
6249}
6250
Tim Peters8ce9f162004-08-27 01:49:32 +00006251PyObject *
6252PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253{
Skip Montanaro6543b452004-09-16 03:28:13 +00006254 const Py_UNICODE blank = ' ';
6255 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006256 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006257 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006258 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6259 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006260 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6261 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006262 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006263 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264
Tim Peters05eba1f2004-08-27 21:32:02 +00006265 fseq = PySequence_Fast(seq, "");
6266 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006267 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006268 }
6269
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006270 /* NOTE: the following code can't call back into Python code,
6271 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006272 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006273
Tim Peters05eba1f2004-08-27 21:32:02 +00006274 seqlen = PySequence_Fast_GET_SIZE(fseq);
6275 /* If empty sequence, return u"". */
6276 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006277 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6278 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006279 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006280 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006281 /* If singleton sequence with an exact Unicode, return that. */
6282 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 item = items[0];
6284 if (PyUnicode_CheckExact(item)) {
6285 Py_INCREF(item);
6286 res = (PyUnicodeObject *)item;
6287 goto Done;
6288 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006289 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006290 else {
6291 /* Set up sep and seplen */
6292 if (separator == NULL) {
6293 sep = &blank;
6294 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006295 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006296 else {
6297 if (!PyUnicode_Check(separator)) {
6298 PyErr_Format(PyExc_TypeError,
6299 "separator: expected str instance,"
6300 " %.80s found",
6301 Py_TYPE(separator)->tp_name);
6302 goto onError;
6303 }
6304 sep = PyUnicode_AS_UNICODE(separator);
6305 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006306 }
6307 }
6308
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006309 /* There are at least two things to join, or else we have a subclass
6310 * of str in the sequence.
6311 * Do a pre-pass to figure out the total amount of space we'll
6312 * need (sz), and see whether all argument are strings.
6313 */
6314 sz = 0;
6315 for (i = 0; i < seqlen; i++) {
6316 const Py_ssize_t old_sz = sz;
6317 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 if (!PyUnicode_Check(item)) {
6319 PyErr_Format(PyExc_TypeError,
6320 "sequence item %zd: expected str instance,"
6321 " %.80s found",
6322 i, Py_TYPE(item)->tp_name);
6323 goto onError;
6324 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006325 sz += PyUnicode_GET_SIZE(item);
6326 if (i != 0)
6327 sz += seplen;
6328 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6329 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006331 goto onError;
6332 }
6333 }
Tim Petersced69f82003-09-16 20:30:58 +00006334
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006335 res = _PyUnicode_New(sz);
6336 if (res == NULL)
6337 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006338
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006339 /* Catenate everything. */
6340 res_p = PyUnicode_AS_UNICODE(res);
6341 for (i = 0; i < seqlen; ++i) {
6342 Py_ssize_t itemlen;
6343 item = items[i];
6344 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 /* Copy item, and maybe the separator. */
6346 if (i) {
6347 Py_UNICODE_COPY(res_p, sep, seplen);
6348 res_p += seplen;
6349 }
6350 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6351 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006352 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006353
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006355 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356 return (PyObject *)res;
6357
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006359 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006360 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 return NULL;
6362}
6363
Tim Petersced69f82003-09-16 20:30:58 +00006364static
6365PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 Py_ssize_t left,
6367 Py_ssize_t right,
6368 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369{
6370 PyUnicodeObject *u;
6371
6372 if (left < 0)
6373 left = 0;
6374 if (right < 0)
6375 right = 0;
6376
Tim Peters7a29bd52001-09-12 03:03:31 +00006377 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 Py_INCREF(self);
6379 return self;
6380 }
6381
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006382 if (left > PY_SSIZE_T_MAX - self->length ||
6383 right > PY_SSIZE_T_MAX - (left + self->length)) {
6384 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6385 return NULL;
6386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 u = _PyUnicode_New(left + self->length + right);
6388 if (u) {
6389 if (left)
6390 Py_UNICODE_FILL(u->str, fill, left);
6391 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6392 if (right)
6393 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6394 }
6395
6396 return u;
6397}
6398
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006399PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402
6403 string = PyUnicode_FromObject(string);
6404 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006407 list = stringlib_splitlines(
6408 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6409 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410
6411 Py_DECREF(string);
6412 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413}
6414
Tim Petersced69f82003-09-16 20:30:58 +00006415static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 PyUnicodeObject *substring,
6418 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006421 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006424 return stringlib_split_whitespace(
6425 (PyObject*) self, self->str, self->length, maxcount
6426 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006428 return stringlib_split(
6429 (PyObject*) self, self->str, self->length,
6430 substring->str, substring->length,
6431 maxcount
6432 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433}
6434
Tim Petersced69f82003-09-16 20:30:58 +00006435static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006436PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 PyUnicodeObject *substring,
6438 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006439{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006440 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006441 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006442
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006443 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006444 return stringlib_rsplit_whitespace(
6445 (PyObject*) self, self->str, self->length, maxcount
6446 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006447
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006448 return stringlib_rsplit(
6449 (PyObject*) self, self->str, self->length,
6450 substring->str, substring->length,
6451 maxcount
6452 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006453}
6454
6455static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 PyUnicodeObject *str1,
6458 PyUnicodeObject *str2,
6459 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460{
6461 PyUnicodeObject *u;
6462
6463 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006465 else if (maxcount == 0 || self->length == 0)
6466 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467
Thomas Wouters477c8d52006-05-27 19:21:47 +00006468 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006469 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006470 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006471 if (str1->length == 0)
6472 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006473 if (str1->length == 1) {
6474 /* replace characters */
6475 Py_UNICODE u1, u2;
6476 if (!findchar(self->str, self->length, str1->str[0]))
6477 goto nothing;
6478 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6479 if (!u)
6480 return NULL;
6481 Py_UNICODE_COPY(u->str, self->str, self->length);
6482 u1 = str1->str[0];
6483 u2 = str2->str[0];
6484 for (i = 0; i < u->length; i++)
6485 if (u->str[i] == u1) {
6486 if (--maxcount < 0)
6487 break;
6488 u->str[i] = u2;
6489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006491 i = stringlib_find(
6492 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006494 if (i < 0)
6495 goto nothing;
6496 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6497 if (!u)
6498 return NULL;
6499 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006500
6501 /* change everything in-place, starting with this one */
6502 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6503 i += str1->length;
6504
6505 while ( --maxcount > 0) {
6506 i = stringlib_find(self->str+i, self->length-i,
6507 str1->str, str1->length,
6508 i);
6509 if (i == -1)
6510 break;
6511 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6512 i += str1->length;
6513 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516
6517 Py_ssize_t n, i, j, e;
6518 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 Py_UNICODE *p;
6520
6521 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006522 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6523 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006524 if (n == 0)
6525 goto nothing;
6526 /* new_size = self->length + n * (str2->length - str1->length)); */
6527 delta = (str2->length - str1->length);
6528 if (delta == 0) {
6529 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006531 product = n * (str2->length - str1->length);
6532 if ((product / (str2->length - str1->length)) != n) {
6533 PyErr_SetString(PyExc_OverflowError,
6534 "replace string is too long");
6535 return NULL;
6536 }
6537 new_size = self->length + product;
6538 if (new_size < 0) {
6539 PyErr_SetString(PyExc_OverflowError,
6540 "replace string is too long");
6541 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 }
6543 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006544 u = _PyUnicode_New(new_size);
6545 if (!u)
6546 return NULL;
6547 i = 0;
6548 p = u->str;
6549 e = self->length - str1->length;
6550 if (str1->length > 0) {
6551 while (n-- > 0) {
6552 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006553 j = stringlib_find(self->str+i, self->length-i,
6554 str1->str, str1->length,
6555 i);
6556 if (j == -1)
6557 break;
6558 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006559 /* copy unchanged part [i:j] */
6560 Py_UNICODE_COPY(p, self->str+i, j-i);
6561 p += j - i;
6562 }
6563 /* copy substitution string */
6564 if (str2->length > 0) {
6565 Py_UNICODE_COPY(p, str2->str, str2->length);
6566 p += str2->length;
6567 }
6568 i = j + str1->length;
6569 }
6570 if (i < self->length)
6571 /* copy tail [i:] */
6572 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6573 } else {
6574 /* interleave */
6575 while (n > 0) {
6576 Py_UNICODE_COPY(p, str2->str, str2->length);
6577 p += str2->length;
6578 if (--n <= 0)
6579 break;
6580 *p++ = self->str[i++];
6581 }
6582 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006586
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006588 /* nothing to replace; return original string (when possible) */
6589 if (PyUnicode_CheckExact(self)) {
6590 Py_INCREF(self);
6591 return (PyObject *) self;
6592 }
6593 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594}
6595
6596/* --- Unicode Object Methods --------------------------------------------- */
6597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006598PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600\n\
6601Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006602characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603
6604static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006605unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 return fixup(self, fixtitle);
6608}
6609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006610PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612\n\
6613Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006614have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615
6616static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006617unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 return fixup(self, fixcapitalize);
6620}
6621
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).  Splits self on
   whitespace, capitalizes each word with fixup(..., fixcapitalize) and
   joins the words with single spaces.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6659
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006660/* Argument converter. Coerces to a single unicode character */
6661
6662static int
6663convert_uc(PyObject *obj, void *addr)
6664{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006665 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6666 PyObject *uniobj;
6667 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006668
Benjamin Peterson14339b62009-01-31 16:36:08 +00006669 uniobj = PyUnicode_FromObject(obj);
6670 if (uniobj == NULL) {
6671 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006673 return 0;
6674 }
6675 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6676 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006678 Py_DECREF(uniobj);
6679 return 0;
6680 }
6681 unistr = PyUnicode_AS_UNICODE(uniobj);
6682 *fillcharloc = unistr[0];
6683 Py_DECREF(uniobj);
6684 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006685}
6686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006687PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006690Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006691done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
6693static PyObject *
6694unicode_center(PyUnicodeObject *self, PyObject *args)
6695{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006696 Py_ssize_t marg, left;
6697 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006698 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699
Thomas Woutersde017742006-02-16 19:34:37 +00006700 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 return NULL;
6702
Tim Peters7a29bd52001-09-12 03:03:31 +00006703 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 Py_INCREF(self);
6705 return (PyObject*) self;
6706 }
6707
6708 marg = width - self->length;
6709 left = marg / 2 + (marg & width & 1);
6710
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006711 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712}
6713
Marc-André Lemburge5034372000-08-08 08:04:29 +00006714#if 0
6715
6716/* This code should go into some future Unicode collation support
6717 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006718 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006719
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006720/* speedy UTF-16 code point order comparison */
6721/* gleaned from: */
6722/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6723
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006724static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006725{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006726 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006727 0, 0, 0, 0, 0, 0, 0, 0,
6728 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006729 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006730};
6731
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732static int
6733unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6734{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006735 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006736
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 Py_UNICODE *s1 = str1->str;
6738 Py_UNICODE *s2 = str2->str;
6739
6740 len1 = str1->length;
6741 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006744 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006745
6746 c1 = *s1++;
6747 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006748
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 if (c1 > (1<<11) * 26)
6750 c1 += utf16Fixup[c1>>11];
6751 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006752 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006753 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006754
6755 if (c1 != c2)
6756 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006757
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006758 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 }
6760
6761 return (len1 < len2) ? -1 : (len1 != len2);
6762}
6763
Marc-André Lemburge5034372000-08-08 08:04:29 +00006764#else
6765
6766static int
6767unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6768{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006769 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006770
6771 Py_UNICODE *s1 = str1->str;
6772 Py_UNICODE *s2 = str2->str;
6773
6774 len1 = str1->length;
6775 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006776
Marc-André Lemburge5034372000-08-08 08:04:29 +00006777 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006778 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006779
Fredrik Lundh45714e92001-06-26 16:39:36 +00006780 c1 = *s1++;
6781 c2 = *s2++;
6782
6783 if (c1 != c2)
6784 return (c1 < c2) ? -1 : 1;
6785
Marc-André Lemburge5034372000-08-08 08:04:29 +00006786 len1--; len2--;
6787 }
6788
6789 return (len1 < len2) ? -1 : (len1 != len2);
6790}
6791
6792#endif
6793
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006797 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6798 return unicode_compare((PyUnicodeObject *)left,
6799 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006800 PyErr_Format(PyExc_TypeError,
6801 "Can't compare %.100s and %.100s",
6802 left->ob_type->tp_name,
6803 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 return -1;
6805}
6806
Martin v. Löwis5b222132007-06-10 09:51:05 +00006807int
6808PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6809{
6810 int i;
6811 Py_UNICODE *id;
6812 assert(PyUnicode_Check(uni));
6813 id = PyUnicode_AS_UNICODE(uni);
6814 /* Compare Unicode string and source character set string */
6815 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 if (id[i] != str[i])
6817 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006818 /* This check keeps Python strings that end in '\0' from comparing equal
6819 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006820 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006822 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006824 return 0;
6825}
6826
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006827
Benjamin Peterson29060642009-01-31 22:14:21 +00006828#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006829 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006830
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006831PyObject *PyUnicode_RichCompare(PyObject *left,
6832 PyObject *right,
6833 int op)
6834{
6835 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006836
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006837 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6838 PyObject *v;
6839 if (((PyUnicodeObject *) left)->length !=
6840 ((PyUnicodeObject *) right)->length) {
6841 if (op == Py_EQ) {
6842 Py_INCREF(Py_False);
6843 return Py_False;
6844 }
6845 if (op == Py_NE) {
6846 Py_INCREF(Py_True);
6847 return Py_True;
6848 }
6849 }
6850 if (left == right)
6851 result = 0;
6852 else
6853 result = unicode_compare((PyUnicodeObject *)left,
6854 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006855
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006856 /* Convert the return value to a Boolean */
6857 switch (op) {
6858 case Py_EQ:
6859 v = TEST_COND(result == 0);
6860 break;
6861 case Py_NE:
6862 v = TEST_COND(result != 0);
6863 break;
6864 case Py_LE:
6865 v = TEST_COND(result <= 0);
6866 break;
6867 case Py_GE:
6868 v = TEST_COND(result >= 0);
6869 break;
6870 case Py_LT:
6871 v = TEST_COND(result == -1);
6872 break;
6873 case Py_GT:
6874 v = TEST_COND(result == 1);
6875 break;
6876 default:
6877 PyErr_BadArgument();
6878 return NULL;
6879 }
6880 Py_INCREF(v);
6881 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006882 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006883
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006884 Py_INCREF(Py_NotImplemented);
6885 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006886}
6887
Guido van Rossum403d68b2000-03-13 15:55:09 +00006888int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006890{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006891 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006892 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006893
6894 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006895 sub = PyUnicode_FromObject(element);
6896 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 PyErr_Format(PyExc_TypeError,
6898 "'in <string>' requires string as left operand, not %s",
6899 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006900 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006901 }
6902
Thomas Wouters477c8d52006-05-27 19:21:47 +00006903 str = PyUnicode_FromObject(container);
6904 if (!str) {
6905 Py_DECREF(sub);
6906 return -1;
6907 }
6908
6909 result = stringlib_contains_obj(str, sub);
6910
6911 Py_DECREF(str);
6912 Py_DECREF(sub);
6913
Guido van Rossum403d68b2000-03-13 15:55:09 +00006914 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006915}
6916
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917/* Concat to string or Unicode object giving a new Unicode object. */
6918
6919PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921{
6922 PyUnicodeObject *u = NULL, *v = NULL, *w;
6923
6924 /* Coerce the two arguments */
6925 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6926 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6929 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931
6932 /* Shortcuts */
6933 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 Py_DECREF(v);
6935 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 }
6937 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 Py_DECREF(u);
6939 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 }
6941
6942 /* Concat the two Unicode strings */
6943 w = _PyUnicode_New(u->length + v->length);
6944 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 Py_UNICODE_COPY(w->str, u->str, u->length);
6947 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6948
6949 Py_DECREF(u);
6950 Py_DECREF(v);
6951 return (PyObject *)w;
6952
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 Py_XDECREF(u);
6955 Py_XDECREF(v);
6956 return NULL;
6957}
6958
Walter Dörwald1ab83302007-05-18 17:15:44 +00006959void
6960PyUnicode_Append(PyObject **pleft, PyObject *right)
6961{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006962 PyObject *new;
6963 if (*pleft == NULL)
6964 return;
6965 if (right == NULL || !PyUnicode_Check(*pleft)) {
6966 Py_DECREF(*pleft);
6967 *pleft = NULL;
6968 return;
6969 }
6970 new = PyUnicode_Concat(*pleft, right);
6971 Py_DECREF(*pleft);
6972 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006973}
6974
6975void
6976PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6977{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006978 PyUnicode_Append(pleft, right);
6979 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006980}
6981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006982PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006985Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006986string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006987interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988
6989static PyObject *
6990unicode_count(PyUnicodeObject *self, PyObject *args)
6991{
6992 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006993 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006994 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 PyObject *result;
6996
Guido van Rossumb8872e62000-05-09 14:14:27 +00006997 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 return NULL;
7000
7001 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007002 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007005
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007006 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007007 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007008 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007009 substring->str, substring->length,
7010 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007011 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012
7013 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007014
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015 return result;
7016}
7017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007018PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007021Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007022to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007023handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007024a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7025'xmlcharrefreplace' as well as any other name registered with\n\
7026codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027
7028static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007029unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007031 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 char *encoding = NULL;
7033 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007034 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007035
Benjamin Peterson308d6372009-09-18 21:42:35 +00007036 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7037 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007039 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007040 if (v == NULL)
7041 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007042 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007043 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007044 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007045 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007046 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007047 Py_DECREF(v);
7048 return NULL;
7049 }
7050 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007051
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007053 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007054}
7055
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007056PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058\n\
7059Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007060If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061
7062static PyObject*
7063unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7064{
7065 Py_UNICODE *e;
7066 Py_UNICODE *p;
7067 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007068 Py_UNICODE *qe;
7069 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070 PyUnicodeObject *u;
7071 int tabsize = 8;
7072
7073 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075
Thomas Wouters7e474022000-07-16 12:04:32 +00007076 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007077 i = 0; /* chars up to and including most recent \n or \r */
7078 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7079 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 for (p = self->str; p < e; p++)
7081 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 if (tabsize > 0) {
7083 incr = tabsize - (j % tabsize); /* cannot overflow */
7084 if (j > PY_SSIZE_T_MAX - incr)
7085 goto overflow1;
7086 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007087 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 if (j > PY_SSIZE_T_MAX - 1)
7091 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 j++;
7093 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 if (i > PY_SSIZE_T_MAX - j)
7095 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007097 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 }
7099 }
7100
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007101 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007103
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 /* Second pass: create output string and fill it */
7105 u = _PyUnicode_New(i + j);
7106 if (!u)
7107 return NULL;
7108
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007109 j = 0; /* same as in first pass */
7110 q = u->str; /* next output char */
7111 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112
7113 for (p = self->str; p < e; p++)
7114 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 if (tabsize > 0) {
7116 i = tabsize - (j % tabsize);
7117 j += i;
7118 while (i--) {
7119 if (q >= qe)
7120 goto overflow2;
7121 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007122 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007124 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 else {
7126 if (q >= qe)
7127 goto overflow2;
7128 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007129 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 if (*p == '\n' || *p == '\r')
7131 j = 0;
7132 }
7133
7134 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007135
7136 overflow2:
7137 Py_DECREF(u);
7138 overflow1:
7139 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141}
7142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007143PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007144 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145\n\
7146Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007147such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148arguments start and end are interpreted as in slice notation.\n\
7149\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007150Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151
7152static PyObject *
7153unicode_find(PyUnicodeObject *self, PyObject *args)
7154{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007155 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007156 Py_ssize_t start;
7157 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007158 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159
Christian Heimes9cd17752007-11-18 19:35:23 +00007160 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162
Thomas Wouters477c8d52006-05-27 19:21:47 +00007163 result = stringlib_find_slice(
7164 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7165 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7166 start, end
7167 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168
7169 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007170
Christian Heimes217cfd12007-12-02 14:31:20 +00007171 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172}
7173
7174static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007175unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176{
7177 if (index < 0 || index >= self->length) {
7178 PyErr_SetString(PyExc_IndexError, "string index out of range");
7179 return NULL;
7180 }
7181
7182 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7183}
7184
Guido van Rossumc2504932007-09-18 19:42:40 +00007185/* Believe it or not, this produces the same value for ASCII strings
7186 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007188unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189{
Guido van Rossumc2504932007-09-18 19:42:40 +00007190 Py_ssize_t len;
7191 Py_UNICODE *p;
7192 long x;
7193
7194 if (self->hash != -1)
7195 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007196 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007197 p = self->str;
7198 x = *p << 7;
7199 while (--len >= 0)
7200 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007201 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007202 if (x == -1)
7203 x = -2;
7204 self->hash = x;
7205 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206}
7207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007208PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007211Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212
7213static PyObject *
7214unicode_index(PyUnicodeObject *self, PyObject *args)
7215{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007216 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007217 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007218 Py_ssize_t start;
7219 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220
Christian Heimes9cd17752007-11-18 19:35:23 +00007221 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223
Thomas Wouters477c8d52006-05-27 19:21:47 +00007224 result = stringlib_find_slice(
7225 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7226 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7227 start, end
7228 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229
7230 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007231
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 if (result < 0) {
7233 PyErr_SetString(PyExc_ValueError, "substring not found");
7234 return NULL;
7235 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007236
Christian Heimes217cfd12007-12-02 14:31:20 +00007237 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238}
7239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007240PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007243Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007244at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245
7246static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007247unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248{
7249 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7250 register const Py_UNICODE *e;
7251 int cased;
7252
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253 /* Shortcut for single character strings */
7254 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007257 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007258 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007260
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 e = p + PyUnicode_GET_SIZE(self);
7262 cased = 0;
7263 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007265
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7267 return PyBool_FromLong(0);
7268 else if (!cased && Py_UNICODE_ISLOWER(ch))
7269 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007271 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272}
7273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007274PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007275 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007277Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007278at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279
7280static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007281unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282{
7283 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7284 register const Py_UNICODE *e;
7285 int cased;
7286
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287 /* Shortcut for single character strings */
7288 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007289 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007291 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007292 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007294
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295 e = p + PyUnicode_GET_SIZE(self);
7296 cased = 0;
7297 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007299
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7301 return PyBool_FromLong(0);
7302 else if (!cased && Py_UNICODE_ISUPPER(ch))
7303 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007305 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306}
7307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007308PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007311Return True if S is a titlecased string and there is at least one\n\
7312character in S, i.e. upper- and titlecase characters may only\n\
7313follow uncased characters and lowercase characters only cased ones.\n\
7314Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
7316static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007317unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318{
7319 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7320 register const Py_UNICODE *e;
7321 int cased, previous_is_cased;
7322
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323 /* Shortcut for single character strings */
7324 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7326 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007328 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007329 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007331
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 e = p + PyUnicode_GET_SIZE(self);
7333 cased = 0;
7334 previous_is_cased = 0;
7335 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007337
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7339 if (previous_is_cased)
7340 return PyBool_FromLong(0);
7341 previous_is_cased = 1;
7342 cased = 1;
7343 }
7344 else if (Py_UNICODE_ISLOWER(ch)) {
7345 if (!previous_is_cased)
7346 return PyBool_FromLong(0);
7347 previous_is_cased = 1;
7348 cased = 1;
7349 }
7350 else
7351 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007353 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354}
7355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007356PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007359Return True if all characters in S are whitespace\n\
7360and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361
7362static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007363unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364{
7365 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7366 register const Py_UNICODE *e;
7367
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 /* Shortcut for single character strings */
7369 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 Py_UNICODE_ISSPACE(*p))
7371 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007373 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007374 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007376
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377 e = p + PyUnicode_GET_SIZE(self);
7378 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 if (!Py_UNICODE_ISSPACE(*p))
7380 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007382 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383}
7384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007385PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007387\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007388Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007389and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007390
7391static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007392unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007393{
7394 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7395 register const Py_UNICODE *e;
7396
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007397 /* Shortcut for single character strings */
7398 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 Py_UNICODE_ISALPHA(*p))
7400 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007401
7402 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007403 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007405
7406 e = p + PyUnicode_GET_SIZE(self);
7407 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 if (!Py_UNICODE_ISALPHA(*p))
7409 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007410 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007411 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007412}
7413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007414PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007416\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007417Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007418and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007419
7420static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007421unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007422{
7423 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7424 register const Py_UNICODE *e;
7425
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007426 /* Shortcut for single character strings */
7427 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 Py_UNICODE_ISALNUM(*p))
7429 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007430
7431 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007432 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007434
7435 e = p + PyUnicode_GET_SIZE(self);
7436 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 if (!Py_UNICODE_ISALNUM(*p))
7438 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007439 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007440 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007441}
7442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007443PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007446Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007447False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
7449static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007450unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451{
7452 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7453 register const Py_UNICODE *e;
7454
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 /* Shortcut for single character strings */
7456 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 Py_UNICODE_ISDECIMAL(*p))
7458 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007460 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007461 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007463
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 e = p + PyUnicode_GET_SIZE(self);
7465 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 if (!Py_UNICODE_ISDECIMAL(*p))
7467 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007469 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470}
7471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007472PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007475Return True if all characters in S are digits\n\
7476and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477
7478static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007479unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480{
7481 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7482 register const Py_UNICODE *e;
7483
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 /* Shortcut for single character strings */
7485 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 Py_UNICODE_ISDIGIT(*p))
7487 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007489 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007490 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007492
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 e = p + PyUnicode_GET_SIZE(self);
7494 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 if (!Py_UNICODE_ISDIGIT(*p))
7496 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007498 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499}
7500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007501PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007504Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007505False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506
7507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007508unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509{
7510 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7511 register const Py_UNICODE *e;
7512
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 /* Shortcut for single character strings */
7514 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 Py_UNICODE_ISNUMERIC(*p))
7516 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007518 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007519 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007521
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 e = p + PyUnicode_GET_SIZE(self);
7523 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 if (!Py_UNICODE_ISNUMERIC(*p))
7525 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007527 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528}
7529
Martin v. Löwis47383402007-08-15 07:32:56 +00007530int
7531PyUnicode_IsIdentifier(PyObject *self)
7532{
7533 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7534 register const Py_UNICODE *e;
7535
7536 /* Special case for empty strings */
7537 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007539
7540 /* PEP 3131 says that the first character must be in
7541 XID_Start and subsequent characters in XID_Continue,
7542 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007543 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007544 letters, digits, underscore). However, given the current
7545 definition of XID_Start and XID_Continue, it is sufficient
7546 to check just for these, except that _ must be allowed
7547 as starting an identifier. */
7548 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7549 return 0;
7550
7551 e = p + PyUnicode_GET_SIZE(self);
7552 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 if (!_PyUnicode_IsXidContinue(*p))
7554 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007555 }
7556 return 1;
7557}
7558
7559PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007561\n\
7562Return True if S is a valid identifier according\n\
7563to the language definition.");
7564
7565static PyObject*
7566unicode_isidentifier(PyObject *self)
7567{
7568 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7569}
7570
Georg Brandl559e5d72008-06-11 18:37:52 +00007571PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007573\n\
7574Return True if all characters in S are considered\n\
7575printable in repr() or S is empty, False otherwise.");
7576
7577static PyObject*
7578unicode_isprintable(PyObject *self)
7579{
7580 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7581 register const Py_UNICODE *e;
7582
7583 /* Shortcut for single character strings */
7584 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7585 Py_RETURN_TRUE;
7586 }
7587
7588 e = p + PyUnicode_GET_SIZE(self);
7589 for (; p < e; p++) {
7590 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7591 Py_RETURN_FALSE;
7592 }
7593 }
7594 Py_RETURN_TRUE;
7595}
7596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007597PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007598 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599\n\
7600Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007601iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602
7603static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007604unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007606 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607}
7608
Martin v. Löwis18e16552006-02-15 17:27:45 +00007609static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610unicode_length(PyUnicodeObject *self)
7611{
7612 return self->length;
7613}
7614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007615PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007618Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007619done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620
7621static PyObject *
7622unicode_ljust(PyUnicodeObject *self, PyObject *args)
7623{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007624 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007625 Py_UNICODE fillchar = ' ';
7626
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007627 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 return NULL;
7629
Tim Peters7a29bd52001-09-12 03:03:31 +00007630 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631 Py_INCREF(self);
7632 return (PyObject*) self;
7633 }
7634
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007635 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636}
7637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007638PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007641Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642
7643static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007644unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 return fixup(self, fixlower);
7647}
7648
/* Strip modes used by the strip helpers.  Each value doubles as an index
   into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7657
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007658/* externally visible for str.strip(unicode) */
7659PyObject *
7660_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7661{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007662 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7663 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7664 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7665 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7666 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007667
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007669
Benjamin Peterson14339b62009-01-31 16:36:08 +00007670 i = 0;
7671 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7673 i++;
7674 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007675 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007676
Benjamin Peterson14339b62009-01-31 16:36:08 +00007677 j = len;
7678 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 do {
7680 j--;
7681 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7682 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007683 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007684
Benjamin Peterson14339b62009-01-31 16:36:08 +00007685 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 Py_INCREF(self);
7687 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007688 }
7689 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007690 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007691}
7692
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693
7694static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007695do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007697 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7698 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007699
Benjamin Peterson14339b62009-01-31 16:36:08 +00007700 i = 0;
7701 if (striptype != RIGHTSTRIP) {
7702 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7703 i++;
7704 }
7705 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007706
Benjamin Peterson14339b62009-01-31 16:36:08 +00007707 j = len;
7708 if (striptype != LEFTSTRIP) {
7709 do {
7710 j--;
7711 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7712 j++;
7713 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007714
Benjamin Peterson14339b62009-01-31 16:36:08 +00007715 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7716 Py_INCREF(self);
7717 return (PyObject*)self;
7718 }
7719 else
7720 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721}
7722
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007723
7724static PyObject *
7725do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7726{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007727 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007728
Benjamin Peterson14339b62009-01-31 16:36:08 +00007729 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7730 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007731
Benjamin Peterson14339b62009-01-31 16:36:08 +00007732 if (sep != NULL && sep != Py_None) {
7733 if (PyUnicode_Check(sep))
7734 return _PyUnicode_XStrip(self, striptype, sep);
7735 else {
7736 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 "%s arg must be None or str",
7738 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007739 return NULL;
7740 }
7741 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007742
Benjamin Peterson14339b62009-01-31 16:36:08 +00007743 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007744}
7745
7746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007747PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007749\n\
7750Return a copy of the string S with leading and trailing\n\
7751whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007752If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007753
7754static PyObject *
7755unicode_strip(PyUnicodeObject *self, PyObject *args)
7756{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007757 if (PyTuple_GET_SIZE(args) == 0)
7758 return do_strip(self, BOTHSTRIP); /* Common case */
7759 else
7760 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007761}
7762
7763
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007764PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007766\n\
7767Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007768If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007769
7770static PyObject *
7771unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7772{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007773 if (PyTuple_GET_SIZE(args) == 0)
7774 return do_strip(self, LEFTSTRIP); /* Common case */
7775 else
7776 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007777}
7778
7779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007780PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007782\n\
7783Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007784If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007785
7786static PyObject *
7787unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7788{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007789 if (PyTuple_GET_SIZE(args) == 0)
7790 return do_strip(self, RIGHTSTRIP); /* Common case */
7791 else
7792 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007793}
7794
7795
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007797unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798{
7799 PyUnicodeObject *u;
7800 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007801 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007802 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803
Georg Brandl222de0f2009-04-12 12:01:50 +00007804 if (len < 1) {
7805 Py_INCREF(unicode_empty);
7806 return (PyObject *)unicode_empty;
7807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808
Tim Peters7a29bd52001-09-12 03:03:31 +00007809 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 /* no repeat, return original string */
7811 Py_INCREF(str);
7812 return (PyObject*) str;
7813 }
Tim Peters8f422462000-09-09 06:13:41 +00007814
7815 /* ensure # of chars needed doesn't overflow int and # of bytes
7816 * needed doesn't overflow size_t
7817 */
7818 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007819 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007820 PyErr_SetString(PyExc_OverflowError,
7821 "repeated string is too long");
7822 return NULL;
7823 }
7824 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7825 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7826 PyErr_SetString(PyExc_OverflowError,
7827 "repeated string is too long");
7828 return NULL;
7829 }
7830 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 if (!u)
7832 return NULL;
7833
7834 p = u->str;
7835
Georg Brandl222de0f2009-04-12 12:01:50 +00007836 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007837 Py_UNICODE_FILL(p, str->str[0], len);
7838 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007839 Py_ssize_t done = str->length; /* number of characters copied this far */
7840 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007842 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007843 Py_UNICODE_COPY(p+done, p, n);
7844 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 }
7847
7848 return (PyObject*) u;
7849}
7850
7851PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 PyObject *subobj,
7853 PyObject *replobj,
7854 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855{
7856 PyObject *self;
7857 PyObject *str1;
7858 PyObject *str2;
7859 PyObject *result;
7860
7861 self = PyUnicode_FromObject(obj);
7862 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864 str1 = PyUnicode_FromObject(subobj);
7865 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 Py_DECREF(self);
7867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 }
7869 str2 = PyUnicode_FromObject(replobj);
7870 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 Py_DECREF(self);
7872 Py_DECREF(str1);
7873 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 }
Tim Petersced69f82003-09-16 20:30:58 +00007875 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 (PyUnicodeObject *)str1,
7877 (PyUnicodeObject *)str2,
7878 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 Py_DECREF(self);
7880 Py_DECREF(str1);
7881 Py_DECREF(str2);
7882 return result;
7883}
7884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007885PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887\n\
7888Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007889old replaced by new. If the optional argument count is\n\
7890given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891
7892static PyObject*
7893unicode_replace(PyUnicodeObject *self, PyObject *args)
7894{
7895 PyUnicodeObject *str1;
7896 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007897 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898 PyObject *result;
7899
Martin v. Löwis18e16552006-02-15 17:27:45 +00007900 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 return NULL;
7902 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7903 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007906 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 Py_DECREF(str1);
7908 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910
7911 result = replace(self, str1, str2, maxcount);
7912
7913 Py_DECREF(str1);
7914 Py_DECREF(str2);
7915 return result;
7916}
7917
7918static
7919PyObject *unicode_repr(PyObject *unicode)
7920{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007921 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007922 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007923 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7924 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7925
7926 /* XXX(nnorwitz): rather than over-allocating, it would be
7927 better to choose a different scheme. Perhaps scan the
7928 first N-chars of the string and allocate based on that size.
7929 */
7930 /* Initial allocation is based on the longest-possible unichr
7931 escape.
7932
7933 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7934 unichr, so in this case it's the longest unichr escape. In
7935 narrow (UTF-16) builds this is five chars per source unichr
7936 since there are two unichrs in the surrogate pair, so in narrow
7937 (UTF-16) builds it's not the longest unichr escape.
7938
7939 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7940 so in the narrow (UTF-16) build case it's the longest unichr
7941 escape.
7942 */
7943
Walter Dörwald1ab83302007-05-18 17:15:44 +00007944 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007946#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007948#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007950#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007952 if (repr == NULL)
7953 return NULL;
7954
Walter Dörwald1ab83302007-05-18 17:15:44 +00007955 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007956
7957 /* Add quote */
7958 *p++ = (findchar(s, size, '\'') &&
7959 !findchar(s, size, '"')) ? '"' : '\'';
7960 while (size-- > 0) {
7961 Py_UNICODE ch = *s++;
7962
7963 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007964 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007965 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007966 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007967 continue;
7968 }
7969
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007971 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007972 *p++ = '\\';
7973 *p++ = 't';
7974 }
7975 else if (ch == '\n') {
7976 *p++ = '\\';
7977 *p++ = 'n';
7978 }
7979 else if (ch == '\r') {
7980 *p++ = '\\';
7981 *p++ = 'r';
7982 }
7983
7984 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007985 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007986 *p++ = '\\';
7987 *p++ = 'x';
7988 *p++ = hexdigits[(ch >> 4) & 0x000F];
7989 *p++ = hexdigits[ch & 0x000F];
7990 }
7991
Georg Brandl559e5d72008-06-11 18:37:52 +00007992 /* Copy ASCII characters as-is */
7993 else if (ch < 0x7F) {
7994 *p++ = ch;
7995 }
7996
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007998 else {
7999 Py_UCS4 ucs = ch;
8000
8001#ifndef Py_UNICODE_WIDE
8002 Py_UNICODE ch2 = 0;
8003 /* Get code point from surrogate pair */
8004 if (size > 0) {
8005 ch2 = *s;
8006 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008008 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008010 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008011 size--;
8012 }
8013 }
8014#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008015 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008016 (categories Z* and C* except ASCII space)
8017 */
8018 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8019 /* Map 8-bit characters to '\xhh' */
8020 if (ucs <= 0xff) {
8021 *p++ = '\\';
8022 *p++ = 'x';
8023 *p++ = hexdigits[(ch >> 4) & 0x000F];
8024 *p++ = hexdigits[ch & 0x000F];
8025 }
8026 /* Map 21-bit characters to '\U00xxxxxx' */
8027 else if (ucs >= 0x10000) {
8028 *p++ = '\\';
8029 *p++ = 'U';
8030 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8031 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8032 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8033 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8034 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8035 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8036 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8037 *p++ = hexdigits[ucs & 0x0000000F];
8038 }
8039 /* Map 16-bit characters to '\uxxxx' */
8040 else {
8041 *p++ = '\\';
8042 *p++ = 'u';
8043 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8044 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8045 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8046 *p++ = hexdigits[ucs & 0x000F];
8047 }
8048 }
8049 /* Copy characters as-is */
8050 else {
8051 *p++ = ch;
8052#ifndef Py_UNICODE_WIDE
8053 if (ucs >= 0x10000)
8054 *p++ = ch2;
8055#endif
8056 }
8057 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008058 }
8059 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008060 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008061
8062 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008063 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008064 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065}
8066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008067PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069\n\
8070Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008071such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072arguments start and end are interpreted as in slice notation.\n\
8073\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008074Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075
8076static PyObject *
8077unicode_rfind(PyUnicodeObject *self, PyObject *args)
8078{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008079 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008080 Py_ssize_t start;
8081 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008082 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083
Christian Heimes9cd17752007-11-18 19:35:23 +00008084 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086
Thomas Wouters477c8d52006-05-27 19:21:47 +00008087 result = stringlib_rfind_slice(
8088 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8089 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8090 start, end
8091 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092
8093 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008094
Christian Heimes217cfd12007-12-02 14:31:20 +00008095 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096}
8097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008098PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008101Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102
8103static PyObject *
8104unicode_rindex(PyUnicodeObject *self, PyObject *args)
8105{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008106 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008107 Py_ssize_t start;
8108 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008109 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110
Christian Heimes9cd17752007-11-18 19:35:23 +00008111 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113
Thomas Wouters477c8d52006-05-27 19:21:47 +00008114 result = stringlib_rfind_slice(
8115 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8116 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8117 start, end
8118 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119
8120 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008121
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 if (result < 0) {
8123 PyErr_SetString(PyExc_ValueError, "substring not found");
8124 return NULL;
8125 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008126 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127}
8128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008129PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008132Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008133done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134
8135static PyObject *
8136unicode_rjust(PyUnicodeObject *self, PyObject *args)
8137{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008138 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008139 Py_UNICODE fillchar = ' ';
8140
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008141 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 return NULL;
8143
Tim Peters7a29bd52001-09-12 03:03:31 +00008144 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 Py_INCREF(self);
8146 return (PyObject*) self;
8147 }
8148
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008149 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150}
8151
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 PyObject *sep,
8154 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155{
8156 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008157
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 s = PyUnicode_FromObject(s);
8159 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 if (sep != NULL) {
8162 sep = PyUnicode_FromObject(sep);
8163 if (sep == NULL) {
8164 Py_DECREF(s);
8165 return NULL;
8166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167 }
8168
8169 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8170
8171 Py_DECREF(s);
8172 Py_XDECREF(sep);
8173 return result;
8174}
8175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008176PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178\n\
8179Return a list of the words in S, using sep as the\n\
8180delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008181splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008182whitespace string is a separator and empty strings are\n\
8183removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184
8185static PyObject*
8186unicode_split(PyUnicodeObject *self, PyObject *args)
8187{
8188 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008189 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190
Martin v. Löwis18e16552006-02-15 17:27:45 +00008191 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192 return NULL;
8193
8194 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200}
8201
Thomas Wouters477c8d52006-05-27 19:21:47 +00008202PyObject *
8203PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8204{
8205 PyObject* str_obj;
8206 PyObject* sep_obj;
8207 PyObject* out;
8208
8209 str_obj = PyUnicode_FromObject(str_in);
8210 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008212 sep_obj = PyUnicode_FromObject(sep_in);
8213 if (!sep_obj) {
8214 Py_DECREF(str_obj);
8215 return NULL;
8216 }
8217
8218 out = stringlib_partition(
8219 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8220 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8221 );
8222
8223 Py_DECREF(sep_obj);
8224 Py_DECREF(str_obj);
8225
8226 return out;
8227}
8228
8229
8230PyObject *
8231PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8232{
8233 PyObject* str_obj;
8234 PyObject* sep_obj;
8235 PyObject* out;
8236
8237 str_obj = PyUnicode_FromObject(str_in);
8238 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008240 sep_obj = PyUnicode_FromObject(sep_in);
8241 if (!sep_obj) {
8242 Py_DECREF(str_obj);
8243 return NULL;
8244 }
8245
8246 out = stringlib_rpartition(
8247 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8248 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8249 );
8250
8251 Py_DECREF(sep_obj);
8252 Py_DECREF(str_obj);
8253
8254 return out;
8255}
8256
8257PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008259\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008260Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008261the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008262found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008263
8264static PyObject*
8265unicode_partition(PyUnicodeObject *self, PyObject *separator)
8266{
8267 return PyUnicode_Partition((PyObject *)self, separator);
8268}
8269
8270PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008271 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008272\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008273Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008274the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008275separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008276
8277static PyObject*
8278unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8279{
8280 return PyUnicode_RPartition((PyObject *)self, separator);
8281}
8282
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008283PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 PyObject *sep,
8285 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008286{
8287 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008288
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008289 s = PyUnicode_FromObject(s);
8290 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008291 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 if (sep != NULL) {
8293 sep = PyUnicode_FromObject(sep);
8294 if (sep == NULL) {
8295 Py_DECREF(s);
8296 return NULL;
8297 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008298 }
8299
8300 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8301
8302 Py_DECREF(s);
8303 Py_XDECREF(sep);
8304 return result;
8305}
8306
8307PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008309\n\
8310Return a list of the words in S, using sep as the\n\
8311delimiter string, starting at the end of the string and\n\
8312working to the front. If maxsplit is given, at most maxsplit\n\
8313splits are done. If sep is not specified, any whitespace string\n\
8314is a separator.");
8315
8316static PyObject*
8317unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8318{
8319 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008320 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008321
Martin v. Löwis18e16552006-02-15 17:27:45 +00008322 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008323 return NULL;
8324
8325 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008327 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008329 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008331}
8332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008333PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335\n\
8336Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008337Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008338is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339
8340static PyObject*
8341unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8342{
Guido van Rossum86662912000-04-11 15:38:46 +00008343 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344
Guido van Rossum86662912000-04-11 15:38:46 +00008345 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 return NULL;
8347
Guido van Rossum86662912000-04-11 15:38:46 +00008348 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349}
8350
8351static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008352PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353{
Walter Dörwald346737f2007-05-31 10:44:43 +00008354 if (PyUnicode_CheckExact(self)) {
8355 Py_INCREF(self);
8356 return self;
8357 } else
8358 /* Subtype -- return genuine unicode string with the same value. */
8359 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8360 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361}
8362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008363PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365\n\
8366Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008367and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368
8369static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008370unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 return fixup(self, fixswapcase);
8373}
8374
Georg Brandlceee0772007-11-27 23:48:05 +00008375PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008377\n\
8378Return a translation table usable for str.translate().\n\
8379If there is only one argument, it must be a dictionary mapping Unicode\n\
8380ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008381Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008382If there are two arguments, they must be strings of equal length, and\n\
8383in the resulting dictionary, each character in x will be mapped to the\n\
8384character at the same position in y. If there is a third argument, it\n\
8385must be a string, whose characters will be mapped to None in the result.");
8386
8387static PyObject*
8388unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8389{
8390 PyObject *x, *y = NULL, *z = NULL;
8391 PyObject *new = NULL, *key, *value;
8392 Py_ssize_t i = 0;
8393 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008394
Georg Brandlceee0772007-11-27 23:48:05 +00008395 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8396 return NULL;
8397 new = PyDict_New();
8398 if (!new)
8399 return NULL;
8400 if (y != NULL) {
8401 /* x must be a string too, of equal length */
8402 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8403 if (!PyUnicode_Check(x)) {
8404 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8405 "be a string if there is a second argument");
8406 goto err;
8407 }
8408 if (PyUnicode_GET_SIZE(x) != ylen) {
8409 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8410 "arguments must have equal length");
8411 goto err;
8412 }
8413 /* create entries for translating chars in x to those in y */
8414 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008415 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8416 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008417 if (!key || !value)
8418 goto err;
8419 res = PyDict_SetItem(new, key, value);
8420 Py_DECREF(key);
8421 Py_DECREF(value);
8422 if (res < 0)
8423 goto err;
8424 }
8425 /* create entries for deleting chars in z */
8426 if (z != NULL) {
8427 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008428 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008429 if (!key)
8430 goto err;
8431 res = PyDict_SetItem(new, key, Py_None);
8432 Py_DECREF(key);
8433 if (res < 0)
8434 goto err;
8435 }
8436 }
8437 } else {
8438 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008439 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008440 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8441 "to maketrans it must be a dict");
8442 goto err;
8443 }
8444 /* copy entries into the new dict, converting string keys to int keys */
8445 while (PyDict_Next(x, &i, &key, &value)) {
8446 if (PyUnicode_Check(key)) {
8447 /* convert string keys to integer keys */
8448 PyObject *newkey;
8449 if (PyUnicode_GET_SIZE(key) != 1) {
8450 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8451 "table must be of length 1");
8452 goto err;
8453 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008454 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008455 if (!newkey)
8456 goto err;
8457 res = PyDict_SetItem(new, newkey, value);
8458 Py_DECREF(newkey);
8459 if (res < 0)
8460 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008461 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008462 /* just keep integer keys */
8463 if (PyDict_SetItem(new, key, value) < 0)
8464 goto err;
8465 } else {
8466 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8467 "be strings or integers");
8468 goto err;
8469 }
8470 }
8471 }
8472 return new;
8473 err:
8474 Py_DECREF(new);
8475 return NULL;
8476}
8477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008478PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480\n\
8481Return a copy of the string S, where all characters have been mapped\n\
8482through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008483Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008484Unmapped characters are left untouched. Characters mapped to None\n\
8485are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486
8487static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008488unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489{
Georg Brandlceee0772007-11-27 23:48:05 +00008490 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491}
8492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008493PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008496Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497
8498static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008499unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501 return fixup(self, fixupper);
8502}
8503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008504PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008507Pad a numeric string S with zeros on the left, to fill a field\n\
8508of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509
8510static PyObject *
8511unicode_zfill(PyUnicodeObject *self, PyObject *args)
8512{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008513 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514 PyUnicodeObject *u;
8515
Martin v. Löwis18e16552006-02-15 17:27:45 +00008516 Py_ssize_t width;
8517 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518 return NULL;
8519
8520 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008521 if (PyUnicode_CheckExact(self)) {
8522 Py_INCREF(self);
8523 return (PyObject*) self;
8524 }
8525 else
8526 return PyUnicode_FromUnicode(
8527 PyUnicode_AS_UNICODE(self),
8528 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 }
8531
8532 fill = width - self->length;
8533
8534 u = pad(self, fill, 0, '0');
8535
Walter Dörwald068325e2002-04-15 13:36:47 +00008536 if (u == NULL)
8537 return NULL;
8538
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 if (u->str[fill] == '+' || u->str[fill] == '-') {
8540 /* move sign to beginning of string */
8541 u->str[0] = u->str[fill];
8542 u->str[fill] = '0';
8543 }
8544
8545 return (PyObject*) u;
8546}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547
#if 0
/* Compiled out: returns the file-level numfree counter as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long free_count = numfree;

    return PyLong_FromLong(free_count);
}
#endif
8555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008556PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008559Return True if S starts with the specified prefix, False otherwise.\n\
8560With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008561With optional end, stop comparing S at that position.\n\
8562prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563
8564static PyObject *
8565unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008568 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008570 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008571 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008572 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008574 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8576 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008577 if (PyTuple_Check(subobj)) {
8578 Py_ssize_t i;
8579 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8580 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008582 if (substring == NULL)
8583 return NULL;
8584 result = tailmatch(self, substring, start, end, -1);
8585 Py_DECREF(substring);
8586 if (result) {
8587 Py_RETURN_TRUE;
8588 }
8589 }
8590 /* nothing matched */
8591 Py_RETURN_FALSE;
8592 }
8593 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008596 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008598 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599}
8600
8601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008602PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008605Return True if S ends with the specified suffix, False otherwise.\n\
8606With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008607With optional end, stop comparing S at that position.\n\
8608suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
8610static PyObject *
8611unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008614 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008616 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008617 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008618 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008620 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8622 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008623 if (PyTuple_Check(subobj)) {
8624 Py_ssize_t i;
8625 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8626 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008628 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008630 result = tailmatch(self, substring, start, end, +1);
8631 Py_DECREF(substring);
8632 if (result) {
8633 Py_RETURN_TRUE;
8634 }
8635 }
8636 Py_RETURN_FALSE;
8637 }
8638 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008642 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008644 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645}
8646
Eric Smith8c663262007-08-25 02:26:07 +00008647#include "stringlib/string_format.h"
8648
8649PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008651\n\
8652");
8653
Eric Smith4a7d76d2008-05-30 18:10:19 +00008654static PyObject *
8655unicode__format__(PyObject* self, PyObject* args)
8656{
8657 PyObject *format_spec;
8658
8659 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8660 return NULL;
8661
8662 return _PyUnicode_FormatAdvanced(self,
8663 PyUnicode_AS_UNICODE(format_spec),
8664 PyUnicode_GET_SIZE(format_spec));
8665}
8666
Eric Smith8c663262007-08-25 02:26:07 +00008667PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008669\n\
8670");
8671
8672static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008673unicode__sizeof__(PyUnicodeObject *v)
8674{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008675 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8676 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008677}
8678
8679PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008681
8682static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008683unicode_getnewargs(PyUnicodeObject *v)
8684{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008685 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008686}
8687
8688
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689static PyMethodDef unicode_methods[] = {
8690
8691 /* Order is according to common usage: often used methods should
8692 appear first, since lookup is done sequentially. */
8693
Benjamin Peterson308d6372009-09-18 21:42:35 +00008694 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008695 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8696 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008697 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008698 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8699 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8700 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8701 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8702 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8703 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8704 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008705 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008706 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8707 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8708 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008709 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008710 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8711 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8712 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008713 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008714 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008715 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008716 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008717 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8718 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8719 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8720 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8721 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8722 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8723 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8724 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8725 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8726 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8727 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8728 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8729 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8730 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008731 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008732 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008733 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008734 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008735 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008736 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8737 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008738 {"maketrans", (PyCFunction) unicode_maketrans,
8739 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008740 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008741#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008742 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743#endif
8744
8745#if 0
8746 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008747 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748#endif
8749
Benjamin Peterson14339b62009-01-31 16:36:08 +00008750 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751 {NULL, NULL}
8752};
8753
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008754static PyObject *
8755unicode_mod(PyObject *v, PyObject *w)
8756{
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 if (!PyUnicode_Check(v)) {
8758 Py_INCREF(Py_NotImplemented);
8759 return Py_NotImplemented;
8760 }
8761 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008762}
8763
8764static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008765 0, /*nb_add*/
8766 0, /*nb_subtract*/
8767 0, /*nb_multiply*/
8768 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008769};
8770
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008772 (lenfunc) unicode_length, /* sq_length */
8773 PyUnicode_Concat, /* sq_concat */
8774 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8775 (ssizeargfunc) unicode_getitem, /* sq_item */
8776 0, /* sq_slice */
8777 0, /* sq_ass_item */
8778 0, /* sq_ass_slice */
8779 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780};
8781
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008782static PyObject*
8783unicode_subscript(PyUnicodeObject* self, PyObject* item)
8784{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008785 if (PyIndex_Check(item)) {
8786 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008787 if (i == -1 && PyErr_Occurred())
8788 return NULL;
8789 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008790 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008791 return unicode_getitem(self, i);
8792 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008793 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008794 Py_UNICODE* source_buf;
8795 Py_UNICODE* result_buf;
8796 PyObject* result;
8797
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008798 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008799 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008800 return NULL;
8801 }
8802
8803 if (slicelength <= 0) {
8804 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008805 } else if (start == 0 && step == 1 && slicelength == self->length &&
8806 PyUnicode_CheckExact(self)) {
8807 Py_INCREF(self);
8808 return (PyObject *)self;
8809 } else if (step == 1) {
8810 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008811 } else {
8812 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008813 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8814 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008815
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 if (result_buf == NULL)
8817 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008818
8819 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8820 result_buf[i] = source_buf[cur];
8821 }
Tim Petersced69f82003-09-16 20:30:58 +00008822
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008823 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008824 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008825 return result;
8826 }
8827 } else {
8828 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8829 return NULL;
8830 }
8831}
8832
8833static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008834 (lenfunc)unicode_length, /* mp_length */
8835 (binaryfunc)unicode_subscript, /* mp_subscript */
8836 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008837};
8838
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840/* Helpers for PyUnicode_Format() */
8841
8842static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008843getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008845 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008847 (*p_argidx)++;
8848 if (arglen < 0)
8849 return args;
8850 else
8851 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 }
8853 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855 return NULL;
8856}
8857
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008858/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008860static PyObject *
8861formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008863 char *p;
8864 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008866
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867 x = PyFloat_AsDouble(v);
8868 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008869 return NULL;
8870
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008873
Eric Smith0923d1d2009-04-16 20:16:10 +00008874 p = PyOS_double_to_string(x, type, prec,
8875 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008876 if (p == NULL)
8877 return NULL;
8878 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008879 PyMem_Free(p);
8880 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881}
8882
Tim Peters38fd5b62000-09-21 05:43:11 +00008883static PyObject*
8884formatlong(PyObject *val, int flags, int prec, int type)
8885{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008886 char *buf;
8887 int len;
8888 PyObject *str; /* temporary string object. */
8889 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008890
Benjamin Peterson14339b62009-01-31 16:36:08 +00008891 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8892 if (!str)
8893 return NULL;
8894 result = PyUnicode_FromStringAndSize(buf, len);
8895 Py_DECREF(str);
8896 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008897}
8898
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899static int
8900formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008901 size_t buflen,
8902 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008904 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008905 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 if (PyUnicode_GET_SIZE(v) == 1) {
8907 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8908 buf[1] = '\0';
8909 return 1;
8910 }
8911#ifndef Py_UNICODE_WIDE
8912 if (PyUnicode_GET_SIZE(v) == 2) {
8913 /* Decode a valid surrogate pair */
8914 int c0 = PyUnicode_AS_UNICODE(v)[0];
8915 int c1 = PyUnicode_AS_UNICODE(v)[1];
8916 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8917 0xDC00 <= c1 && c1 <= 0xDFFF) {
8918 buf[0] = c0;
8919 buf[1] = c1;
8920 buf[2] = '\0';
8921 return 2;
8922 }
8923 }
8924#endif
8925 goto onError;
8926 }
8927 else {
8928 /* Integer input truncated to a character */
8929 long x;
8930 x = PyLong_AsLong(v);
8931 if (x == -1 && PyErr_Occurred())
8932 goto onError;
8933
8934 if (x < 0 || x > 0x10ffff) {
8935 PyErr_SetString(PyExc_OverflowError,
8936 "%c arg not in range(0x110000)");
8937 return -1;
8938 }
8939
8940#ifndef Py_UNICODE_WIDE
8941 if (x > 0xffff) {
8942 x -= 0x10000;
8943 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8944 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8945 return 2;
8946 }
8947#endif
8948 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008949 buf[1] = '\0';
8950 return 1;
8951 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008952
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008954 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008956 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957}
8958
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008959/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008960 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008961*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008962#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008963
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966{
8967 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008968 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 int args_owned = 0;
8970 PyUnicodeObject *result = NULL;
8971 PyObject *dict = NULL;
8972 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008973
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 PyErr_BadInternalCall();
8976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 }
8978 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008979 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 fmt = PyUnicode_AS_UNICODE(uformat);
8982 fmtcnt = PyUnicode_GET_SIZE(uformat);
8983
8984 reslen = rescnt = fmtcnt + 100;
8985 result = _PyUnicode_New(reslen);
8986 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988 res = PyUnicode_AS_UNICODE(result);
8989
8990 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 arglen = PyTuple_Size(args);
8992 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 }
8994 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 arglen = -1;
8996 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008998 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008999 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001
9002 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 if (*fmt != '%') {
9004 if (--rescnt < 0) {
9005 rescnt = fmtcnt + 100;
9006 reslen += rescnt;
9007 if (_PyUnicode_Resize(&result, reslen) < 0)
9008 goto onError;
9009 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9010 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009011 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009013 }
9014 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 /* Got a format specifier */
9016 int flags = 0;
9017 Py_ssize_t width = -1;
9018 int prec = -1;
9019 Py_UNICODE c = '\0';
9020 Py_UNICODE fill;
9021 int isnumok;
9022 PyObject *v = NULL;
9023 PyObject *temp = NULL;
9024 Py_UNICODE *pbuf;
9025 Py_UNICODE sign;
9026 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009027 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 fmt++;
9030 if (*fmt == '(') {
9031 Py_UNICODE *keystart;
9032 Py_ssize_t keylen;
9033 PyObject *key;
9034 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009035
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 if (dict == NULL) {
9037 PyErr_SetString(PyExc_TypeError,
9038 "format requires a mapping");
9039 goto onError;
9040 }
9041 ++fmt;
9042 --fmtcnt;
9043 keystart = fmt;
9044 /* Skip over balanced parentheses */
9045 while (pcount > 0 && --fmtcnt >= 0) {
9046 if (*fmt == ')')
9047 --pcount;
9048 else if (*fmt == '(')
9049 ++pcount;
9050 fmt++;
9051 }
9052 keylen = fmt - keystart - 1;
9053 if (fmtcnt < 0 || pcount > 0) {
9054 PyErr_SetString(PyExc_ValueError,
9055 "incomplete format key");
9056 goto onError;
9057 }
9058#if 0
9059 /* keys are converted to strings using UTF-8 and
9060 then looked up since Python uses strings to hold
9061 variables names etc. in its namespaces and we
9062 wouldn't want to break common idioms. */
9063 key = PyUnicode_EncodeUTF8(keystart,
9064 keylen,
9065 NULL);
9066#else
9067 key = PyUnicode_FromUnicode(keystart, keylen);
9068#endif
9069 if (key == NULL)
9070 goto onError;
9071 if (args_owned) {
9072 Py_DECREF(args);
9073 args_owned = 0;
9074 }
9075 args = PyObject_GetItem(dict, key);
9076 Py_DECREF(key);
9077 if (args == NULL) {
9078 goto onError;
9079 }
9080 args_owned = 1;
9081 arglen = -1;
9082 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009083 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009084 while (--fmtcnt >= 0) {
9085 switch (c = *fmt++) {
9086 case '-': flags |= F_LJUST; continue;
9087 case '+': flags |= F_SIGN; continue;
9088 case ' ': flags |= F_BLANK; continue;
9089 case '#': flags |= F_ALT; continue;
9090 case '0': flags |= F_ZERO; continue;
9091 }
9092 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 if (c == '*') {
9095 v = getnextarg(args, arglen, &argidx);
9096 if (v == NULL)
9097 goto onError;
9098 if (!PyLong_Check(v)) {
9099 PyErr_SetString(PyExc_TypeError,
9100 "* wants int");
9101 goto onError;
9102 }
9103 width = PyLong_AsLong(v);
9104 if (width == -1 && PyErr_Occurred())
9105 goto onError;
9106 if (width < 0) {
9107 flags |= F_LJUST;
9108 width = -width;
9109 }
9110 if (--fmtcnt >= 0)
9111 c = *fmt++;
9112 }
9113 else if (c >= '0' && c <= '9') {
9114 width = c - '0';
9115 while (--fmtcnt >= 0) {
9116 c = *fmt++;
9117 if (c < '0' || c > '9')
9118 break;
9119 if ((width*10) / 10 != width) {
9120 PyErr_SetString(PyExc_ValueError,
9121 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009122 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 }
9124 width = width*10 + (c - '0');
9125 }
9126 }
9127 if (c == '.') {
9128 prec = 0;
9129 if (--fmtcnt >= 0)
9130 c = *fmt++;
9131 if (c == '*') {
9132 v = getnextarg(args, arglen, &argidx);
9133 if (v == NULL)
9134 goto onError;
9135 if (!PyLong_Check(v)) {
9136 PyErr_SetString(PyExc_TypeError,
9137 "* wants int");
9138 goto onError;
9139 }
9140 prec = PyLong_AsLong(v);
9141 if (prec == -1 && PyErr_Occurred())
9142 goto onError;
9143 if (prec < 0)
9144 prec = 0;
9145 if (--fmtcnt >= 0)
9146 c = *fmt++;
9147 }
9148 else if (c >= '0' && c <= '9') {
9149 prec = c - '0';
9150 while (--fmtcnt >= 0) {
9151 c = Py_CHARMASK(*fmt++);
9152 if (c < '0' || c > '9')
9153 break;
9154 if ((prec*10) / 10 != prec) {
9155 PyErr_SetString(PyExc_ValueError,
9156 "prec too big");
9157 goto onError;
9158 }
9159 prec = prec*10 + (c - '0');
9160 }
9161 }
9162 } /* prec */
9163 if (fmtcnt >= 0) {
9164 if (c == 'h' || c == 'l' || c == 'L') {
9165 if (--fmtcnt >= 0)
9166 c = *fmt++;
9167 }
9168 }
9169 if (fmtcnt < 0) {
9170 PyErr_SetString(PyExc_ValueError,
9171 "incomplete format");
9172 goto onError;
9173 }
9174 if (c != '%') {
9175 v = getnextarg(args, arglen, &argidx);
9176 if (v == NULL)
9177 goto onError;
9178 }
9179 sign = 0;
9180 fill = ' ';
9181 switch (c) {
9182
9183 case '%':
9184 pbuf = formatbuf;
9185 /* presume that buffer length is at least 1 */
9186 pbuf[0] = '%';
9187 len = 1;
9188 break;
9189
9190 case 's':
9191 case 'r':
9192 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009193 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 temp = v;
9195 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009196 }
9197 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009198 if (c == 's')
9199 temp = PyObject_Str(v);
9200 else if (c == 'r')
9201 temp = PyObject_Repr(v);
9202 else
9203 temp = PyObject_ASCII(v);
9204 if (temp == NULL)
9205 goto onError;
9206 if (PyUnicode_Check(temp))
9207 /* nothing to do */;
9208 else {
9209 Py_DECREF(temp);
9210 PyErr_SetString(PyExc_TypeError,
9211 "%s argument has non-string str()");
9212 goto onError;
9213 }
9214 }
9215 pbuf = PyUnicode_AS_UNICODE(temp);
9216 len = PyUnicode_GET_SIZE(temp);
9217 if (prec >= 0 && len > prec)
9218 len = prec;
9219 break;
9220
9221 case 'i':
9222 case 'd':
9223 case 'u':
9224 case 'o':
9225 case 'x':
9226 case 'X':
9227 if (c == 'i')
9228 c = 'd';
9229 isnumok = 0;
9230 if (PyNumber_Check(v)) {
9231 PyObject *iobj=NULL;
9232
9233 if (PyLong_Check(v)) {
9234 iobj = v;
9235 Py_INCREF(iobj);
9236 }
9237 else {
9238 iobj = PyNumber_Long(v);
9239 }
9240 if (iobj!=NULL) {
9241 if (PyLong_Check(iobj)) {
9242 isnumok = 1;
9243 temp = formatlong(iobj, flags, prec, c);
9244 Py_DECREF(iobj);
9245 if (!temp)
9246 goto onError;
9247 pbuf = PyUnicode_AS_UNICODE(temp);
9248 len = PyUnicode_GET_SIZE(temp);
9249 sign = 1;
9250 }
9251 else {
9252 Py_DECREF(iobj);
9253 }
9254 }
9255 }
9256 if (!isnumok) {
9257 PyErr_Format(PyExc_TypeError,
9258 "%%%c format: a number is required, "
9259 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9260 goto onError;
9261 }
9262 if (flags & F_ZERO)
9263 fill = '0';
9264 break;
9265
9266 case 'e':
9267 case 'E':
9268 case 'f':
9269 case 'F':
9270 case 'g':
9271 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009272 temp = formatfloat(v, flags, prec, c);
9273 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009274 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009275 pbuf = PyUnicode_AS_UNICODE(temp);
9276 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009277 sign = 1;
9278 if (flags & F_ZERO)
9279 fill = '0';
9280 break;
9281
9282 case 'c':
9283 pbuf = formatbuf;
9284 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9285 if (len < 0)
9286 goto onError;
9287 break;
9288
9289 default:
9290 PyErr_Format(PyExc_ValueError,
9291 "unsupported format character '%c' (0x%x) "
9292 "at index %zd",
9293 (31<=c && c<=126) ? (char)c : '?',
9294 (int)c,
9295 (Py_ssize_t)(fmt - 1 -
9296 PyUnicode_AS_UNICODE(uformat)));
9297 goto onError;
9298 }
9299 if (sign) {
9300 if (*pbuf == '-' || *pbuf == '+') {
9301 sign = *pbuf++;
9302 len--;
9303 }
9304 else if (flags & F_SIGN)
9305 sign = '+';
9306 else if (flags & F_BLANK)
9307 sign = ' ';
9308 else
9309 sign = 0;
9310 }
9311 if (width < len)
9312 width = len;
9313 if (rescnt - (sign != 0) < width) {
9314 reslen -= rescnt;
9315 rescnt = width + fmtcnt + 100;
9316 reslen += rescnt;
9317 if (reslen < 0) {
9318 Py_XDECREF(temp);
9319 PyErr_NoMemory();
9320 goto onError;
9321 }
9322 if (_PyUnicode_Resize(&result, reslen) < 0) {
9323 Py_XDECREF(temp);
9324 goto onError;
9325 }
9326 res = PyUnicode_AS_UNICODE(result)
9327 + reslen - rescnt;
9328 }
9329 if (sign) {
9330 if (fill != ' ')
9331 *res++ = sign;
9332 rescnt--;
9333 if (width > len)
9334 width--;
9335 }
9336 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9337 assert(pbuf[0] == '0');
9338 assert(pbuf[1] == c);
9339 if (fill != ' ') {
9340 *res++ = *pbuf++;
9341 *res++ = *pbuf++;
9342 }
9343 rescnt -= 2;
9344 width -= 2;
9345 if (width < 0)
9346 width = 0;
9347 len -= 2;
9348 }
9349 if (width > len && !(flags & F_LJUST)) {
9350 do {
9351 --rescnt;
9352 *res++ = fill;
9353 } while (--width > len);
9354 }
9355 if (fill == ' ') {
9356 if (sign)
9357 *res++ = sign;
9358 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9359 assert(pbuf[0] == '0');
9360 assert(pbuf[1] == c);
9361 *res++ = *pbuf++;
9362 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009363 }
9364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009365 Py_UNICODE_COPY(res, pbuf, len);
9366 res += len;
9367 rescnt -= len;
9368 while (--width >= len) {
9369 --rescnt;
9370 *res++ = ' ';
9371 }
9372 if (dict && (argidx < arglen) && c != '%') {
9373 PyErr_SetString(PyExc_TypeError,
9374 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009375 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 goto onError;
9377 }
9378 Py_XDECREF(temp);
9379 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380 } /* until end */
9381 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 PyErr_SetString(PyExc_TypeError,
9383 "not all arguments converted during string formatting");
9384 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385 }
9386
Thomas Woutersa96affe2006-03-12 00:29:36 +00009387 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009390 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391 }
9392 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393 return (PyObject *)result;
9394
Benjamin Peterson29060642009-01-31 22:14:21 +00009395 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396 Py_XDECREF(result);
9397 Py_DECREF(uformat);
9398 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400 }
9401 return NULL;
9402}
9403
Jeremy Hylton938ace62002-07-17 16:30:39 +00009404static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009405unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9406
Tim Peters6d6c1a32001-08-02 04:15:00 +00009407static PyObject *
9408unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9409{
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009411 static char *kwlist[] = {"object", "encoding", "errors", 0};
9412 char *encoding = NULL;
9413 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009414
Benjamin Peterson14339b62009-01-31 16:36:08 +00009415 if (type != &PyUnicode_Type)
9416 return unicode_subtype_new(type, args, kwds);
9417 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009419 return NULL;
9420 if (x == NULL)
9421 return (PyObject *)_PyUnicode_New(0);
9422 if (encoding == NULL && errors == NULL)
9423 return PyObject_Str(x);
9424 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009425 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009426}
9427
Guido van Rossume023fe02001-08-30 03:12:59 +00009428static PyObject *
9429unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9430{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009431 PyUnicodeObject *tmp, *pnew;
9432 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009433
Benjamin Peterson14339b62009-01-31 16:36:08 +00009434 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9435 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9436 if (tmp == NULL)
9437 return NULL;
9438 assert(PyUnicode_Check(tmp));
9439 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9440 if (pnew == NULL) {
9441 Py_DECREF(tmp);
9442 return NULL;
9443 }
9444 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9445 if (pnew->str == NULL) {
9446 _Py_ForgetReference((PyObject *)pnew);
9447 PyObject_Del(pnew);
9448 Py_DECREF(tmp);
9449 return PyErr_NoMemory();
9450 }
9451 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9452 pnew->length = n;
9453 pnew->hash = tmp->hash;
9454 Py_DECREF(tmp);
9455 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009456}
9457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009458PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009459 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009460\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009461Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009462encoding defaults to the current default string encoding.\n\
9463errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009464
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009465static PyObject *unicode_iter(PyObject *seq);
9466
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009468 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009469 "str", /* tp_name */
9470 sizeof(PyUnicodeObject), /* tp_size */
9471 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009473 (destructor)unicode_dealloc, /* tp_dealloc */
9474 0, /* tp_print */
9475 0, /* tp_getattr */
9476 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009477 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009478 unicode_repr, /* tp_repr */
9479 &unicode_as_number, /* tp_as_number */
9480 &unicode_as_sequence, /* tp_as_sequence */
9481 &unicode_as_mapping, /* tp_as_mapping */
9482 (hashfunc) unicode_hash, /* tp_hash*/
9483 0, /* tp_call*/
9484 (reprfunc) unicode_str, /* tp_str */
9485 PyObject_GenericGetAttr, /* tp_getattro */
9486 0, /* tp_setattro */
9487 0, /* tp_as_buffer */
9488 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009490 unicode_doc, /* tp_doc */
9491 0, /* tp_traverse */
9492 0, /* tp_clear */
9493 PyUnicode_RichCompare, /* tp_richcompare */
9494 0, /* tp_weaklistoffset */
9495 unicode_iter, /* tp_iter */
9496 0, /* tp_iternext */
9497 unicode_methods, /* tp_methods */
9498 0, /* tp_members */
9499 0, /* tp_getset */
9500 &PyBaseObject_Type, /* tp_base */
9501 0, /* tp_dict */
9502 0, /* tp_descr_get */
9503 0, /* tp_descr_set */
9504 0, /* tp_dictoffset */
9505 0, /* tp_init */
9506 0, /* tp_alloc */
9507 unicode_new, /* tp_new */
9508 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509};
9510
9511/* Initialize the Unicode implementation */
9512
Thomas Wouters78890102000-07-22 19:25:51 +00009513void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009515 int i;
9516
Thomas Wouters477c8d52006-05-27 19:21:47 +00009517 /* XXX - move this array to unicodectype.c ? */
9518 Py_UNICODE linebreak[] = {
9519 0x000A, /* LINE FEED */
9520 0x000D, /* CARRIAGE RETURN */
9521 0x001C, /* FILE SEPARATOR */
9522 0x001D, /* GROUP SEPARATOR */
9523 0x001E, /* RECORD SEPARATOR */
9524 0x0085, /* NEXT LINE */
9525 0x2028, /* LINE SEPARATOR */
9526 0x2029, /* PARAGRAPH SEPARATOR */
9527 };
9528
Fred Drakee4315f52000-05-09 19:53:39 +00009529 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009530 free_list = NULL;
9531 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009533 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009535
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009536 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009538 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009540
9541 /* initialize the linebreak bloom filter */
9542 bloom_linebreak = make_bloom_mask(
9543 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9544 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009545
9546 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547}
9548
9549/* Finalize the Unicode implementation */
9550
Christian Heimesa156e092008-02-16 07:38:31 +00009551int
9552PyUnicode_ClearFreeList(void)
9553{
9554 int freelist_size = numfree;
9555 PyUnicodeObject *u;
9556
9557 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009558 PyUnicodeObject *v = u;
9559 u = *(PyUnicodeObject **)u;
9560 if (v->str)
9561 PyObject_DEL(v->str);
9562 Py_XDECREF(v->defenc);
9563 PyObject_Del(v);
9564 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009565 }
9566 free_list = NULL;
9567 assert(numfree == 0);
9568 return freelist_size;
9569}
9570
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571void
Thomas Wouters78890102000-07-22 19:25:51 +00009572_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009574 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009576 Py_XDECREF(unicode_empty);
9577 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009578
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009579 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009580 if (unicode_latin1[i]) {
9581 Py_DECREF(unicode_latin1[i]);
9582 unicode_latin1[i] = NULL;
9583 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009584 }
Christian Heimesa156e092008-02-16 07:38:31 +00009585 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009587
Walter Dörwald16807132007-05-25 13:52:07 +00009588void
9589PyUnicode_InternInPlace(PyObject **p)
9590{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009591 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9592 PyObject *t;
9593 if (s == NULL || !PyUnicode_Check(s))
9594 Py_FatalError(
9595 "PyUnicode_InternInPlace: unicode strings only please!");
9596 /* If it's a subclass, we don't really know what putting
9597 it in the interned dict might do. */
9598 if (!PyUnicode_CheckExact(s))
9599 return;
9600 if (PyUnicode_CHECK_INTERNED(s))
9601 return;
9602 if (interned == NULL) {
9603 interned = PyDict_New();
9604 if (interned == NULL) {
9605 PyErr_Clear(); /* Don't leave an exception */
9606 return;
9607 }
9608 }
9609 /* It might be that the GetItem call fails even
9610 though the key is present in the dictionary,
9611 namely when this happens during a stack overflow. */
9612 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009613 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009614 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009615
Benjamin Peterson29060642009-01-31 22:14:21 +00009616 if (t) {
9617 Py_INCREF(t);
9618 Py_DECREF(*p);
9619 *p = t;
9620 return;
9621 }
Walter Dörwald16807132007-05-25 13:52:07 +00009622
Benjamin Peterson14339b62009-01-31 16:36:08 +00009623 PyThreadState_GET()->recursion_critical = 1;
9624 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9625 PyErr_Clear();
9626 PyThreadState_GET()->recursion_critical = 0;
9627 return;
9628 }
9629 PyThreadState_GET()->recursion_critical = 0;
9630 /* The two references in interned are not counted by refcnt.
9631 The deallocator will take care of this */
9632 Py_REFCNT(s) -= 2;
9633 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009634}
9635
9636void
9637PyUnicode_InternImmortal(PyObject **p)
9638{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009639 PyUnicode_InternInPlace(p);
9640 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9641 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9642 Py_INCREF(*p);
9643 }
Walter Dörwald16807132007-05-25 13:52:07 +00009644}
9645
9646PyObject *
9647PyUnicode_InternFromString(const char *cp)
9648{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009649 PyObject *s = PyUnicode_FromString(cp);
9650 if (s == NULL)
9651 return NULL;
9652 PyUnicode_InternInPlace(&s);
9653 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009654}
9655
9656void _Py_ReleaseInternedUnicodeStrings(void)
9657{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009658 PyObject *keys;
9659 PyUnicodeObject *s;
9660 Py_ssize_t i, n;
9661 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009662
Benjamin Peterson14339b62009-01-31 16:36:08 +00009663 if (interned == NULL || !PyDict_Check(interned))
9664 return;
9665 keys = PyDict_Keys(interned);
9666 if (keys == NULL || !PyList_Check(keys)) {
9667 PyErr_Clear();
9668 return;
9669 }
Walter Dörwald16807132007-05-25 13:52:07 +00009670
Benjamin Peterson14339b62009-01-31 16:36:08 +00009671 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9672 detector, interned unicode strings are not forcibly deallocated;
9673 rather, we give them their stolen references back, and then clear
9674 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009675
Benjamin Peterson14339b62009-01-31 16:36:08 +00009676 n = PyList_GET_SIZE(keys);
9677 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009678 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009679 for (i = 0; i < n; i++) {
9680 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9681 switch (s->state) {
9682 case SSTATE_NOT_INTERNED:
9683 /* XXX Shouldn't happen */
9684 break;
9685 case SSTATE_INTERNED_IMMORTAL:
9686 Py_REFCNT(s) += 1;
9687 immortal_size += s->length;
9688 break;
9689 case SSTATE_INTERNED_MORTAL:
9690 Py_REFCNT(s) += 2;
9691 mortal_size += s->length;
9692 break;
9693 default:
9694 Py_FatalError("Inconsistent interned string state.");
9695 }
9696 s->state = SSTATE_NOT_INTERNED;
9697 }
9698 fprintf(stderr, "total size of all interned strings: "
9699 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9700 "mortal/immortal\n", mortal_size, immortal_size);
9701 Py_DECREF(keys);
9702 PyDict_Clear(interned);
9703 Py_DECREF(interned);
9704 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009705}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009706
9707
9708/********************* Unicode Iterator **************************/
9709
9710typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009711 PyObject_HEAD
9712 Py_ssize_t it_index;
9713 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009714} unicodeiterobject;
9715
9716static void
9717unicodeiter_dealloc(unicodeiterobject *it)
9718{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009719 _PyObject_GC_UNTRACK(it);
9720 Py_XDECREF(it->it_seq);
9721 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009722}
9723
9724static int
9725unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9726{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009727 Py_VISIT(it->it_seq);
9728 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009729}
9730
9731static PyObject *
9732unicodeiter_next(unicodeiterobject *it)
9733{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009734 PyUnicodeObject *seq;
9735 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009736
Benjamin Peterson14339b62009-01-31 16:36:08 +00009737 assert(it != NULL);
9738 seq = it->it_seq;
9739 if (seq == NULL)
9740 return NULL;
9741 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009742
Benjamin Peterson14339b62009-01-31 16:36:08 +00009743 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9744 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009745 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009746 if (item != NULL)
9747 ++it->it_index;
9748 return item;
9749 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009750
Benjamin Peterson14339b62009-01-31 16:36:08 +00009751 Py_DECREF(seq);
9752 it->it_seq = NULL;
9753 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009754}
9755
9756static PyObject *
9757unicodeiter_len(unicodeiterobject *it)
9758{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009759 Py_ssize_t len = 0;
9760 if (it->it_seq)
9761 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9762 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009763}
9764
9765PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9766
9767static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009768 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009769 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009770 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009771};
9772
9773PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009774 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9775 "str_iterator", /* tp_name */
9776 sizeof(unicodeiterobject), /* tp_basicsize */
9777 0, /* tp_itemsize */
9778 /* methods */
9779 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9780 0, /* tp_print */
9781 0, /* tp_getattr */
9782 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009783 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009784 0, /* tp_repr */
9785 0, /* tp_as_number */
9786 0, /* tp_as_sequence */
9787 0, /* tp_as_mapping */
9788 0, /* tp_hash */
9789 0, /* tp_call */
9790 0, /* tp_str */
9791 PyObject_GenericGetAttr, /* tp_getattro */
9792 0, /* tp_setattro */
9793 0, /* tp_as_buffer */
9794 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9795 0, /* tp_doc */
9796 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9797 0, /* tp_clear */
9798 0, /* tp_richcompare */
9799 0, /* tp_weaklistoffset */
9800 PyObject_SelfIter, /* tp_iter */
9801 (iternextfunc)unicodeiter_next, /* tp_iternext */
9802 unicodeiter_methods, /* tp_methods */
9803 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009804};
9805
9806static PyObject *
9807unicode_iter(PyObject *seq)
9808{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009809 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009810
Benjamin Peterson14339b62009-01-31 16:36:08 +00009811 if (!PyUnicode_Check(seq)) {
9812 PyErr_BadInternalCall();
9813 return NULL;
9814 }
9815 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9816 if (it == NULL)
9817 return NULL;
9818 it->it_index = 0;
9819 Py_INCREF(seq);
9820 it->it_seq = (PyUnicodeObject *)seq;
9821 _PyObject_GC_TRACK(it);
9822 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009823}
9824
Martin v. Löwis5b222132007-06-10 09:51:05 +00009825size_t
9826Py_UNICODE_strlen(const Py_UNICODE *u)
9827{
9828 int res = 0;
9829 while(*u++)
9830 res++;
9831 return res;
9832}
9833
9834Py_UNICODE*
9835Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9836{
9837 Py_UNICODE *u = s1;
9838 while ((*u++ = *s2++));
9839 return s1;
9840}
9841
9842Py_UNICODE*
9843Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9844{
9845 Py_UNICODE *u = s1;
9846 while ((*u++ = *s2++))
9847 if (n-- == 0)
9848 break;
9849 return s1;
9850}
9851
9852int
9853Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9854{
9855 while (*s1 && *s2 && *s1 == *s2)
9856 s1++, s2++;
9857 if (*s1 && *s2)
9858 return (*s1 < *s2) ? -1 : +1;
9859 if (*s1)
9860 return 1;
9861 if (*s2)
9862 return -1;
9863 return 0;
9864}
9865
9866Py_UNICODE*
9867Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9868{
9869 const Py_UNICODE *p;
9870 for (p = s; *p; p++)
9871 if (*p == c)
9872 return (Py_UNICODE*)p;
9873 return NULL;
9874}
9875
9876
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009877#ifdef __cplusplus
9878}
9879#endif