blob: 43c827f981bc07a4a437b56b333dac3e7aa5a576 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* Indexed by ASCII code (0..127); an entry of 1 marks whitespace. */

    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
Christian Heimes190d79e2008-01-30 11:58:22 +0000168/* Same for linebreaks */
static unsigned char ascii_linebreak[] = {
    /* Indexed by ASCII code (0..127); an entry of 1 marks a line break. */

    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by the low
   bits of `ch` (ch & (BLOOM_WIDTH - 1)).  `mask` must be a modifiable
   lvalue for BLOOM_ADD.  Both arguments are parenthesized so that
   expression arguments cannot change the meaning of the expansion. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* True if chr is a member of set[0..setlen-1].  The bloom mask is tested
   first as a cheap filter; unicode_member() is only called when the mask
   bit for chr is set.  The whole expansion is parenthesized so the macro
   can be safely combined with other operators (e.g. `!`, `||`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
/* We allocate one more Py_UNICODE element to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
/* Append the NUL-terminated byte string `string` to the output.

   Relies on two variables of the enclosing scope: `s`, the output cursor
   (advanced by one for every character copied), and `copy`, a scratch
   `const char *`.  Each source char is stored unchanged into *s.

   Wrapped in do { } while (0) so that "appendstring(x);" is exactly one
   statement and can be used safely as the body of an if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Georg Brandl952867a2010-06-27 10:17:12 +00001276 "coercing to str: need bytes, bytearray or char buffer, "
Benjamin Peterson29060642009-01-31 22:14:21 +00001277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   are converted to lower case and every '_' becomes '-', so that e.g.
   UTF_8 is recognised.  `lower_len` is the size of the `lower` buffer
   including room for the terminating NUL byte.

   Returns 1 on success and 0 when `encoding` is longer than
   lower_len-1 characters (the buffer is then left unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating NUL */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        if (dst == limit)
            return 0;
        if (ISUPPER(*src))
            *dst++ = TOLOWER(*src);
        else if (*src == '_')
            *dst++ = '-';
        else
            *dst++ = *src;
    }
    *dst = '\0';
    return 1;
}
1328
1329PyObject *PyUnicode_Decode(const char *s,
1330 Py_ssize_t size,
1331 const char *encoding,
1332 const char *errors)
1333{
1334 PyObject *buffer = NULL, *unicode;
1335 Py_buffer info;
1336 char lower[11]; /* Enough for any encoding shortcut */
1337
1338 if (encoding == NULL)
1339 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001340
1341 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001342 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1343 if (strcmp(lower, "utf-8") == 0)
1344 return PyUnicode_DecodeUTF8(s, size, errors);
1345 else if ((strcmp(lower, "latin-1") == 0) ||
1346 (strcmp(lower, "iso-8859-1") == 0))
1347 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001348#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001349 else if (strcmp(lower, "mbcs") == 0)
1350 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001351#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001352 else if (strcmp(lower, "ascii") == 0)
1353 return PyUnicode_DecodeASCII(s, size, errors);
1354 else if (strcmp(lower, "utf-16") == 0)
1355 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1356 else if (strcmp(lower, "utf-32") == 0)
1357 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359
1360 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001361 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001362 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001363 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001364 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 if (buffer == NULL)
1366 goto onError;
1367 unicode = PyCodec_Decode(buffer, encoding, errors);
1368 if (unicode == NULL)
1369 goto onError;
1370 if (!PyUnicode_Check(unicode)) {
1371 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001372 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001373 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001374 Py_DECREF(unicode);
1375 goto onError;
1376 }
1377 Py_DECREF(buffer);
1378 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001379
Benjamin Peterson29060642009-01-31 22:14:21 +00001380 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 Py_XDECREF(buffer);
1382 return NULL;
1383}
1384
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001385PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1386 const char *encoding,
1387 const char *errors)
1388{
1389 PyObject *v;
1390
1391 if (!PyUnicode_Check(unicode)) {
1392 PyErr_BadArgument();
1393 goto onError;
1394 }
1395
1396 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001398
1399 /* Decode via the codec registry */
1400 v = PyCodec_Decode(unicode, encoding, errors);
1401 if (v == NULL)
1402 goto onError;
1403 return v;
1404
Benjamin Peterson29060642009-01-31 22:14:21 +00001405 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001406 return NULL;
1407}
1408
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001409PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1410 const char *encoding,
1411 const char *errors)
1412{
1413 PyObject *v;
1414
1415 if (!PyUnicode_Check(unicode)) {
1416 PyErr_BadArgument();
1417 goto onError;
1418 }
1419
1420 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001422
1423 /* Decode via the codec registry */
1424 v = PyCodec_Decode(unicode, encoding, errors);
1425 if (v == NULL)
1426 goto onError;
1427 if (!PyUnicode_Check(v)) {
1428 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001429 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001430 Py_TYPE(v)->tp_name);
1431 Py_DECREF(v);
1432 goto onError;
1433 }
1434 return v;
1435
Benjamin Peterson29060642009-01-31 22:14:21 +00001436 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001437 return NULL;
1438}
1439
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001441 Py_ssize_t size,
1442 const char *encoding,
1443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444{
1445 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001446
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 unicode = PyUnicode_FromUnicode(s, size);
1448 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1451 Py_DECREF(unicode);
1452 return v;
1453}
1454
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001455PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1456 const char *encoding,
1457 const char *errors)
1458{
1459 PyObject *v;
1460
1461 if (!PyUnicode_Check(unicode)) {
1462 PyErr_BadArgument();
1463 goto onError;
1464 }
1465
1466 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001467 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001468
1469 /* Encode via the codec registry */
1470 v = PyCodec_Encode(unicode, encoding, errors);
1471 if (v == NULL)
1472 goto onError;
1473 return v;
1474
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001476 return NULL;
1477}
1478
Victor Stinnerae6265f2010-05-15 16:27:27 +00001479PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1480{
Victor Stinner313a1202010-06-11 23:56:51 +00001481 if (Py_FileSystemDefaultEncoding) {
1482#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1483 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1484 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1485 PyUnicode_GET_SIZE(unicode),
1486 NULL);
1487#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001488 return PyUnicode_AsEncodedString(unicode,
1489 Py_FileSystemDefaultEncoding,
1490 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001491 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001492 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1493 PyUnicode_GET_SIZE(unicode),
1494 "surrogateescape");
1495}
1496
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1498 const char *encoding,
1499 const char *errors)
1500{
1501 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001502 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001503
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 if (!PyUnicode_Check(unicode)) {
1505 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 }
Fred Drakee4315f52000-05-09 19:53:39 +00001508
Tim Petersced69f82003-09-16 20:30:58 +00001509 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001511
1512 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001513 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1514 if (strcmp(lower, "utf-8") == 0)
1515 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 errors);
1518 else if ((strcmp(lower, "latin-1") == 0) ||
1519 (strcmp(lower, "iso-8859-1") == 0))
1520 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1521 PyUnicode_GET_SIZE(unicode),
1522 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001523#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001524 else if (strcmp(lower, "mbcs") == 0)
1525 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1526 PyUnicode_GET_SIZE(unicode),
1527 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001528#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001529 else if (strcmp(lower, "ascii") == 0)
1530 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1531 PyUnicode_GET_SIZE(unicode),
1532 errors);
1533 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001534 /* During bootstrap, we may need to find the encodings
1535 package, to load the file system encoding, and require the
1536 file system encoding in order to load the encodings
1537 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001538
Victor Stinner59e62db2010-05-15 13:14:32 +00001539 Break out of this dependency by assuming that the path to
1540 the encodings module is ASCII-only. XXX could try wcstombs
1541 instead, if the file system encoding is the locale's
1542 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001543 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001544 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1545 !PyThreadState_GET()->interp->codecs_initialized)
1546 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1547 PyUnicode_GET_SIZE(unicode),
1548 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549
1550 /* Encode via the codec registry */
1551 v = PyCodec_Encode(unicode, encoding, errors);
1552 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001553 return NULL;
1554
1555 /* The normal path */
1556 if (PyBytes_Check(v))
1557 return v;
1558
1559 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560 if (PyByteArray_Check(v)) {
1561 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001562 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001563 PyOS_snprintf(msg, sizeof(msg),
1564 "encoder %s returned buffer instead of bytes",
1565 encoding);
1566 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001567 Py_DECREF(v);
1568 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001569 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001570
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001571 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1572 Py_DECREF(v);
1573 return b;
1574 }
1575
1576 PyErr_Format(PyExc_TypeError,
1577 "encoder did not return a bytes object (type=%.400s)",
1578 Py_TYPE(v)->tp_name);
1579 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001580 return NULL;
1581}
1582
1583PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1584 const char *encoding,
1585 const char *errors)
1586{
1587 PyObject *v;
1588
1589 if (!PyUnicode_Check(unicode)) {
1590 PyErr_BadArgument();
1591 goto onError;
1592 }
1593
1594 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001595 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001596
1597 /* Encode via the codec registry */
1598 v = PyCodec_Encode(unicode, encoding, errors);
1599 if (v == NULL)
1600 goto onError;
1601 if (!PyUnicode_Check(v)) {
1602 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001603 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001604 Py_TYPE(v)->tp_name);
1605 Py_DECREF(v);
1606 goto onError;
1607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001609
Benjamin Peterson29060642009-01-31 22:14:21 +00001610 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 return NULL;
1612}
1613
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001614PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001615 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001616{
1617 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001618 if (v)
1619 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001620 if (errors != NULL)
1621 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001622 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001623 PyUnicode_GET_SIZE(unicode),
1624 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001625 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001626 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001627 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001628 return v;
1629}
1630
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001631PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001632PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001633 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001634 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1635}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001636
Christian Heimes5894ba72007-11-04 11:43:14 +00001637PyObject*
1638PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1639{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001640 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1641 can be undefined. If it is case, decode using UTF-8. The following assumes
1642 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1643 bootstrapping process where the codecs aren't ready yet.
1644 */
1645 if (Py_FileSystemDefaultEncoding) {
1646#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001647 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001648 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001649 }
1650#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001651 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001652 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001653 }
1654#endif
1655 return PyUnicode_Decode(s, size,
1656 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001657 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001658 }
1659 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001660 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001661 }
1662}
1663
Martin v. Löwis011e8422009-05-05 04:43:17 +00001664/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001665 system encoding. The addr param must be a PyObject**.
1666 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001667
1668int
1669PyUnicode_FSConverter(PyObject* arg, void* addr)
1670{
1671 PyObject *output = NULL;
1672 Py_ssize_t size;
1673 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001674 if (arg == NULL) {
1675 Py_DECREF(*(PyObject**)addr);
1676 return 1;
1677 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001678 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001679 output = arg;
1680 Py_INCREF(output);
1681 }
1682 else {
1683 arg = PyUnicode_FromObject(arg);
1684 if (!arg)
1685 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001686 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001687 Py_DECREF(arg);
1688 if (!output)
1689 return 0;
1690 if (!PyBytes_Check(output)) {
1691 Py_DECREF(output);
1692 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1693 return 0;
1694 }
1695 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001696 size = PyBytes_GET_SIZE(output);
1697 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001698 if (size != strlen(data)) {
1699 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1700 Py_DECREF(output);
1701 return 0;
1702 }
1703 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001704 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001705}
1706
1707
Martin v. Löwis5b222132007-06-10 09:51:05 +00001708char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001709_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001710{
Christian Heimesf3863112007-11-22 07:46:41 +00001711 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001712 if (!PyUnicode_Check(unicode)) {
1713 PyErr_BadArgument();
1714 return NULL;
1715 }
Christian Heimesf3863112007-11-22 07:46:41 +00001716 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1717 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001718 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001719 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001720 *psize = PyBytes_GET_SIZE(bytes);
1721 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001722}
1723
1724char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001725_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001726{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001727 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001728}
1729
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1731{
1732 if (!PyUnicode_Check(unicode)) {
1733 PyErr_BadArgument();
1734 goto onError;
1735 }
1736 return PyUnicode_AS_UNICODE(unicode);
1737
Benjamin Peterson29060642009-01-31 22:14:21 +00001738 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return NULL;
1740}
1741
Martin v. Löwis18e16552006-02-15 17:27:45 +00001742Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743{
1744 if (!PyUnicode_Check(unicode)) {
1745 PyErr_BadArgument();
1746 goto onError;
1747 }
1748 return PyUnicode_GET_SIZE(unicode);
1749
Benjamin Peterson29060642009-01-31 22:14:21 +00001750 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return -1;
1752}
1753
Thomas Wouters78890102000-07-22 19:25:51 +00001754const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001755{
1756 return unicode_default_encoding;
1757}
1758
1759int PyUnicode_SetDefaultEncoding(const char *encoding)
1760{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001761 if (strcmp(encoding, unicode_default_encoding) != 0) {
1762 PyErr_Format(PyExc_ValueError,
1763 "Can only set default encoding to %s",
1764 unicode_default_encoding);
1765 return -1;
1766 }
Fred Drakee4315f52000-05-09 19:53:39 +00001767 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001768}
1769
Victor Stinner554f3f02010-06-16 23:33:54 +00001770/* create or adjust a UnicodeDecodeError */
1771static void
1772make_decode_exception(PyObject **exceptionObject,
1773 const char *encoding,
1774 const char *input, Py_ssize_t length,
1775 Py_ssize_t startpos, Py_ssize_t endpos,
1776 const char *reason)
1777{
1778 if (*exceptionObject == NULL) {
1779 *exceptionObject = PyUnicodeDecodeError_Create(
1780 encoding, input, length, startpos, endpos, reason);
1781 }
1782 else {
1783 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1784 goto onError;
1785 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1786 goto onError;
1787 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1788 goto onError;
1789 }
1790 return;
1791
1792onError:
1793 Py_DECREF(*exceptionObject);
1794 *exceptionObject = NULL;
1795}
1796
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001797/* error handling callback helper:
1798 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001799 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001800 and adjust various state variables.
1801 return 0 on success, -1 on error
1802*/
1803
1804static
1805int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001806 const char *encoding, const char *reason,
1807 const char **input, const char **inend, Py_ssize_t *startinpos,
1808 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1809 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001811 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812
1813 PyObject *restuple = NULL;
1814 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001815 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001816 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001817 Py_ssize_t requiredsize;
1818 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001819 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001820 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001821 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001822 int res = -1;
1823
1824 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001825 *errorHandler = PyCodec_LookupError(errors);
1826 if (*errorHandler == NULL)
1827 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 }
1829
Victor Stinner554f3f02010-06-16 23:33:54 +00001830 make_decode_exception(exceptionObject,
1831 encoding,
1832 *input, *inend - *input,
1833 *startinpos, *endinpos,
1834 reason);
1835 if (*exceptionObject == NULL)
1836 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837
1838 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1839 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001840 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001842 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001843 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 }
1845 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001847
1848 /* Copy back the bytes variables, which might have been modified by the
1849 callback */
1850 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1851 if (!inputobj)
1852 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001853 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001854 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001855 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001856 *input = PyBytes_AS_STRING(inputobj);
1857 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001858 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001859 /* we can DECREF safely, as the exception has another reference,
1860 so the object won't go away. */
1861 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001864 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001865 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001866 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1867 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001868 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869
1870 /* need more space? (at least enough for what we
1871 have+the replacement+the rest of the string (starting
1872 at the new input position), so we won't have to check space
1873 when there are no errors in the rest of the string) */
1874 repptr = PyUnicode_AS_UNICODE(repunicode);
1875 repsize = PyUnicode_GET_SIZE(repunicode);
1876 requiredsize = *outpos + repsize + insize-newpos;
1877 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001878 if (requiredsize<2*outsize)
1879 requiredsize = 2*outsize;
1880 if (_PyUnicode_Resize(output, requiredsize) < 0)
1881 goto onError;
1882 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 }
1884 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001885 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001886 Py_UNICODE_COPY(*outptr, repptr, repsize);
1887 *outptr += repsize;
1888 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001889
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001890 /* we made it! */
1891 res = 0;
1892
Benjamin Peterson29060642009-01-31 22:14:21 +00001893 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001894 Py_XDECREF(restuple);
1895 return res;
1896}
1897
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001898/* --- UTF-7 Codec -------------------------------------------------------- */
1899
Antoine Pitrou244651a2009-05-04 18:56:13 +00001900/* See RFC2152 for details. We encode conservatively and decode liberally. */
1901
1902/* Three simple macros defining base-64. */
1903
/* Non-zero when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ((c) >= '0' && (c) <= '9') ||              \
     ((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z'))
1911
/* Map a base-64 character to its 6-bit value (A-Z: 0-25, a-z: 26-51,
   0-9: 52-61, '+': 62, anything else: 63).  Only meaningful when
   IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)
1919
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 0x3f])
1924
/* DECODE_DIRECT: true when a byte met in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                        \
    ((c) != '+' && (c) <= 127)
1932
1933/* The UTF-7 encoder treats ASCII characters differently according to
1934 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1935 * the above). See RFC2152. This array identifies these different
1936 * sets:
1937 * 0 : "Set D"
1938 * alphanumeric and '(),-./:?
1939 * 1 : "Set O"
1940 * !"#$%&*;<=>@[]^_`{|}
1941 * 2 : "whitespace"
1942 * ht nl cr sp
1943 * 3 : special (must be base64 encoded)
1944 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1945 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001946
/* Read-only lookup table indexed by ASCII code (0-127):
   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1966
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify; directO / directWS are truth values
 * enabling direct encoding of Set O and whitespace respectively.  Every
 * macro argument is parenthesized so that compound expressions (for
 * example `a || b`) keep their intended meaning inside the `&&`. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001978
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001979PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001980 Py_ssize_t size,
1981 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001982{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001983 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1984}
1985
Antoine Pitrou244651a2009-05-04 18:56:13 +00001986/* The decoder. The only state we preserve is our read position,
1987 * i.e. how many characters we have consumed. So if we end in the
1988 * middle of a shift sequence we have to back off the read position
1989 * and the output to the beginning of the sequence, otherwise we lose
1990 * all the shift state (seen bits, number of bits seen, high
1991 * surrogate). */
1992
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001993PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001994 Py_ssize_t size,
1995 const char *errors,
1996 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001997{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001998 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001999 Py_ssize_t startinpos;
2000 Py_ssize_t endinpos;
2001 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002002 const char *e;
2003 PyUnicodeObject *unicode;
2004 Py_UNICODE *p;
2005 const char *errmsg = "";
2006 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002007 Py_UNICODE *shiftOutStart;
2008 unsigned int base64bits = 0;
2009 unsigned long base64buffer = 0;
2010 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 PyObject *errorHandler = NULL;
2012 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002013
2014 unicode = _PyUnicode_New(size);
2015 if (!unicode)
2016 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002017 if (size == 0) {
2018 if (consumed)
2019 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002020 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002021 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002022
2023 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002024 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002025 e = s + size;
2026
2027 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002029 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002030 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002031
Antoine Pitrou244651a2009-05-04 18:56:13 +00002032 if (inShift) { /* in a base-64 section */
2033 if (IS_BASE64(ch)) { /* consume a base-64 character */
2034 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2035 base64bits += 6;
2036 s++;
2037 if (base64bits >= 16) {
2038 /* we have enough bits for a UTF-16 value */
2039 Py_UNICODE outCh = (Py_UNICODE)
2040 (base64buffer >> (base64bits-16));
2041 base64bits -= 16;
2042 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2043 if (surrogate) {
2044 /* expecting a second surrogate */
2045 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2046#ifdef Py_UNICODE_WIDE
2047 *p++ = (((surrogate & 0x3FF)<<10)
2048 | (outCh & 0x3FF)) + 0x10000;
2049#else
2050 *p++ = surrogate;
2051 *p++ = outCh;
2052#endif
2053 surrogate = 0;
2054 }
2055 else {
2056 surrogate = 0;
2057 errmsg = "second surrogate missing";
2058 goto utf7Error;
2059 }
2060 }
2061 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2062 /* first surrogate */
2063 surrogate = outCh;
2064 }
2065 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2066 errmsg = "unexpected second surrogate";
2067 goto utf7Error;
2068 }
2069 else {
2070 *p++ = outCh;
2071 }
2072 }
2073 }
2074 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002075 inShift = 0;
2076 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002077 if (surrogate) {
2078 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002079 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002080 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002081 if (base64bits > 0) { /* left-over bits */
2082 if (base64bits >= 6) {
2083 /* We've seen at least one base-64 character */
2084 errmsg = "partial character in shift sequence";
2085 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002086 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002087 else {
2088 /* Some bits remain; they should be zero */
2089 if (base64buffer != 0) {
2090 errmsg = "non-zero padding bits in shift sequence";
2091 goto utf7Error;
2092 }
2093 }
2094 }
2095 if (ch != '-') {
2096 /* '-' is absorbed; other terminating
2097 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002098 *p++ = ch;
2099 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002100 }
2101 }
2102 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002103 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002104 s++; /* consume '+' */
2105 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002106 s++;
2107 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002108 }
2109 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002110 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002111 shiftOutStart = p;
2112 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002113 }
2114 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002115 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002116 *p++ = ch;
2117 s++;
2118 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002119 else {
2120 startinpos = s-starts;
2121 s++;
2122 errmsg = "unexpected special character";
2123 goto utf7Error;
2124 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002125 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002126utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 outpos = p-PyUnicode_AS_UNICODE(unicode);
2128 endinpos = s-starts;
2129 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002130 errors, &errorHandler,
2131 "utf7", errmsg,
2132 &starts, &e, &startinpos, &endinpos, &exc, &s,
2133 &unicode, &outpos, &p))
2134 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135 }
2136
Antoine Pitrou244651a2009-05-04 18:56:13 +00002137 /* end of string */
2138
2139 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2140 /* if we're in an inconsistent state, that's an error */
2141 if (surrogate ||
2142 (base64bits >= 6) ||
2143 (base64bits > 0 && base64buffer != 0)) {
2144 outpos = p-PyUnicode_AS_UNICODE(unicode);
2145 endinpos = size;
2146 if (unicode_decode_call_errorhandler(
2147 errors, &errorHandler,
2148 "utf7", "unterminated shift sequence",
2149 &starts, &e, &startinpos, &endinpos, &exc, &s,
2150 &unicode, &outpos, &p))
2151 goto onError;
2152 if (s < e)
2153 goto restart;
2154 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002155 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002156
2157 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002158 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002159 if (inShift) {
2160 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002161 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002162 }
2163 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002164 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002165 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002166 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002167
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002168 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002169 goto onError;
2170
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002171 Py_XDECREF(errorHandler);
2172 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002173 return (PyObject *)unicode;
2174
Benjamin Peterson29060642009-01-31 22:14:21 +00002175 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176 Py_XDECREF(errorHandler);
2177 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002178 Py_DECREF(unicode);
2179 return NULL;
2180}
2181
2182
2183PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002184 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002185 int base64SetO,
2186 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002187 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002188{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002189 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002190 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002191 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002192 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002193 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002194 unsigned int base64bits = 0;
2195 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002196 char * out;
2197 char * start;
2198
2199 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002200 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002201
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002202 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002203 return PyErr_NoMemory();
2204
Antoine Pitrou244651a2009-05-04 18:56:13 +00002205 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206 if (v == NULL)
2207 return NULL;
2208
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002209 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002210 for (;i < size; ++i) {
2211 Py_UNICODE ch = s[i];
2212
Antoine Pitrou244651a2009-05-04 18:56:13 +00002213 if (inShift) {
2214 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2215 /* shifting out */
2216 if (base64bits) { /* output remaining bits */
2217 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2218 base64buffer = 0;
2219 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002220 }
2221 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002222 /* Characters not in the BASE64 set implicitly unshift the sequence
2223 so no '-' is required, except if the character is itself a '-' */
2224 if (IS_BASE64(ch) || ch == '-') {
2225 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002226 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002227 *out++ = (char) ch;
2228 }
2229 else {
2230 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002231 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002232 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002233 else { /* not in a shift sequence */
2234 if (ch == '+') {
2235 *out++ = '+';
2236 *out++ = '-';
2237 }
2238 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2239 *out++ = (char) ch;
2240 }
2241 else {
2242 *out++ = '+';
2243 inShift = 1;
2244 goto encode_char;
2245 }
2246 }
2247 continue;
2248encode_char:
2249#ifdef Py_UNICODE_WIDE
2250 if (ch >= 0x10000) {
2251 /* code first surrogate */
2252 base64bits += 16;
2253 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2254 while (base64bits >= 6) {
2255 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2256 base64bits -= 6;
2257 }
2258 /* prepare second surrogate */
2259 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2260 }
2261#endif
2262 base64bits += 16;
2263 base64buffer = (base64buffer << 16) | ch;
2264 while (base64bits >= 6) {
2265 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2266 base64bits -= 6;
2267 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002268 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002269 if (base64bits)
2270 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2271 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002272 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002273 if (_PyBytes_Resize(&v, out - start) < 0)
2274 return NULL;
2275 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002276}
2277
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002283
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284/* --- UTF-8 Codec -------------------------------------------------------- */
2285
/* Sequence length implied by the first byte of a UTF-8 sequence, indexed by
   that byte's value.  A zero entry marks a byte that cannot start a
   sequence.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte sequences */
    /* 00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-BF: not valid as a first byte */
    /* 80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    /* C0 */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF start three-byte sequences */
    /* E0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    /* F0 */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
2307
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002309 Py_ssize_t size,
2310 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002311{
Walter Dörwald69652032004-09-07 20:24:22 +00002312 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2313}
2314
Antoine Pitrouab868312009-01-10 15:40:25 +00002315/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2316#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2317
2318/* Mask to quickly check whether a C 'long' contains a
2319 non-ASCII, UTF8-encoded char. */
2320#if (SIZEOF_LONG == 8)
2321# define ASCII_CHAR_MASK 0x8080808080808080L
2322#elif (SIZEOF_LONG == 4)
2323# define ASCII_CHAR_MASK 0x80808080L
2324#else
2325# error C 'long' size should be either 4 or 8!
2326#endif
2327
Walter Dörwald69652032004-09-07 20:24:22 +00002328PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002329 Py_ssize_t size,
2330 const char *errors,
2331 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002332{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002333 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002335 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002336 Py_ssize_t startinpos;
2337 Py_ssize_t endinpos;
2338 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002339 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340 PyUnicodeObject *unicode;
2341 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002342 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002343 PyObject *errorHandler = NULL;
2344 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345
2346 /* Note: size will always be longer than the resulting Unicode
2347 character count */
2348 unicode = _PyUnicode_New(size);
2349 if (!unicode)
2350 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002351 if (size == 0) {
2352 if (consumed)
2353 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002355 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356
2357 /* Unpack UTF-8 encoded data */
2358 p = unicode->str;
2359 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002360 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361
2362 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002363 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002364
2365 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002366 /* Fast path for runs of ASCII characters. Given that common UTF-8
2367 input will consist of an overwhelming majority of ASCII
2368 characters, we try to optimize for this case by checking
2369 as many characters as a C 'long' can contain.
2370 First, check if we can do an aligned read, as most CPUs have
2371 a penalty for unaligned reads.
2372 */
2373 if (!((size_t) s & LONG_PTR_MASK)) {
2374 /* Help register allocation */
2375 register const char *_s = s;
2376 register Py_UNICODE *_p = p;
2377 while (_s < aligned_end) {
2378 /* Read a whole long at a time (either 4 or 8 bytes),
2379 and do a fast unrolled copy if it only contains ASCII
2380 characters. */
2381 unsigned long data = *(unsigned long *) _s;
2382 if (data & ASCII_CHAR_MASK)
2383 break;
2384 _p[0] = (unsigned char) _s[0];
2385 _p[1] = (unsigned char) _s[1];
2386 _p[2] = (unsigned char) _s[2];
2387 _p[3] = (unsigned char) _s[3];
2388#if (SIZEOF_LONG == 8)
2389 _p[4] = (unsigned char) _s[4];
2390 _p[5] = (unsigned char) _s[5];
2391 _p[6] = (unsigned char) _s[6];
2392 _p[7] = (unsigned char) _s[7];
2393#endif
2394 _s += SIZEOF_LONG;
2395 _p += SIZEOF_LONG;
2396 }
2397 s = _s;
2398 p = _p;
2399 if (s == e)
2400 break;
2401 ch = (unsigned char)*s;
2402 }
2403 }
2404
2405 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002406 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002407 s++;
2408 continue;
2409 }
2410
2411 n = utf8_code_length[ch];
2412
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002413 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002414 if (consumed)
2415 break;
2416 else {
2417 errmsg = "unexpected end of data";
2418 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002419 endinpos = startinpos+1;
2420 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2421 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002422 goto utf8Error;
2423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425
2426 switch (n) {
2427
2428 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002429 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002430 startinpos = s-starts;
2431 endinpos = startinpos+1;
2432 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002433
2434 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002435 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002436 startinpos = s-starts;
2437 endinpos = startinpos+1;
2438 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439
2440 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002441 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002442 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002443 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002444 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002445 goto utf8Error;
2446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002448 assert ((ch > 0x007F) && (ch <= 0x07FF));
2449 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002450 break;
2451
2452 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002453 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2454 will result in surrogates in range d800-dfff. Surrogates are
2455 not valid UTF-8 so they are rejected.
2456 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2457 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002458 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002459 (s[2] & 0xc0) != 0x80 ||
2460 ((unsigned char)s[0] == 0xE0 &&
2461 (unsigned char)s[1] < 0xA0) ||
2462 ((unsigned char)s[0] == 0xED &&
2463 (unsigned char)s[1] > 0x9F)) {
2464 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002466 endinpos = startinpos + 1;
2467
2468 /* if s[1] first two bits are 1 and 0, then the invalid
2469 continuation byte is s[2], so increment endinpos by 1,
2470 if not, s[1] is invalid and endinpos doesn't need to
2471 be incremented. */
2472 if ((s[1] & 0xC0) == 0x80)
2473 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002474 goto utf8Error;
2475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002477 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2478 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002479 break;
2480
2481 case 4:
2482 if ((s[1] & 0xc0) != 0x80 ||
2483 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002484 (s[3] & 0xc0) != 0x80 ||
2485 ((unsigned char)s[0] == 0xF0 &&
2486 (unsigned char)s[1] < 0x90) ||
2487 ((unsigned char)s[0] == 0xF4 &&
2488 (unsigned char)s[1] > 0x8F)) {
2489 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002490 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002491 endinpos = startinpos + 1;
2492 if ((s[1] & 0xC0) == 0x80) {
2493 endinpos++;
2494 if ((s[2] & 0xC0) == 0x80)
2495 endinpos++;
2496 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002497 goto utf8Error;
2498 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002499 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002500 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2501 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2502
Fredrik Lundh8f455852001-06-27 18:59:43 +00002503#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002504 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002505#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002506 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002507
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002508 /* translate from 10000..10FFFF to 0..FFFF */
2509 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002510
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002511 /* high surrogate = top 10 bits added to D800 */
2512 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002513
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002514 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002515 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002516#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518 }
2519 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002520 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002521
Benjamin Peterson29060642009-01-31 22:14:21 +00002522 utf8Error:
2523 outpos = p-PyUnicode_AS_UNICODE(unicode);
2524 if (unicode_decode_call_errorhandler(
2525 errors, &errorHandler,
2526 "utf8", errmsg,
2527 &starts, &e, &startinpos, &endinpos, &exc, &s,
2528 &unicode, &outpos, &p))
2529 goto onError;
2530 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 }
Walter Dörwald69652032004-09-07 20:24:22 +00002532 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534
2535 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002536 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 goto onError;
2538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002539 Py_XDECREF(errorHandler);
2540 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 return (PyObject *)unicode;
2542
Benjamin Peterson29060642009-01-31 22:14:21 +00002543 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 Py_XDECREF(errorHandler);
2545 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546 Py_DECREF(unicode);
2547 return NULL;
2548}
2549
/* ASCII_CHAR_MASK is retired here. */
#undef ASCII_CHAR_MASK
2551
2552
Tim Peters602f7402002-04-27 18:03:26 +00002553/* Allocation strategy: if the string is short, convert into a stack buffer
2554 and allocate exactly as much space needed at the end. Else allocate the
2555 maximum possible needed (4 result bytes per Unicode character), and return
2556 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002557*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002558PyObject *
2559PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002560 Py_ssize_t size,
2561 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562{
Tim Peters602f7402002-04-27 18:03:26 +00002563#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002564
Guido van Rossum98297ee2007-11-06 21:34:58 +00002565 Py_ssize_t i; /* index into s of next input byte */
2566 PyObject *result; /* result string object */
2567 char *p; /* next free byte in output buffer */
2568 Py_ssize_t nallocated; /* number of result bytes allocated */
2569 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002570 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002571 PyObject *errorHandler = NULL;
2572 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002573
Tim Peters602f7402002-04-27 18:03:26 +00002574 assert(s != NULL);
2575 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576
Tim Peters602f7402002-04-27 18:03:26 +00002577 if (size <= MAX_SHORT_UNICHARS) {
2578 /* Write into the stack buffer; nallocated can't overflow.
2579 * At the end, we'll allocate exactly as much heap space as it
2580 * turns out we need.
2581 */
2582 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002583 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002584 p = stackbuf;
2585 }
2586 else {
2587 /* Overallocate on the heap, and give the excess back at the end. */
2588 nallocated = size * 4;
2589 if (nallocated / 4 != size) /* overflow! */
2590 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002591 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002592 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002593 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002594 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002595 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002596
Tim Peters602f7402002-04-27 18:03:26 +00002597 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002598 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002599
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002600 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002601 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002603
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002605 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002606 *p++ = (char)(0xc0 | (ch >> 6));
2607 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002608 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002609#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002610 /* Special case: check for high and low surrogate */
2611 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2612 Py_UCS4 ch2 = s[i];
2613 /* Combine the two surrogates to form a UCS4 value */
2614 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2615 i++;
2616
2617 /* Encode UCS4 Unicode ordinals */
2618 *p++ = (char)(0xf0 | (ch >> 18));
2619 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002620 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2621 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002622 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002623#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002624 Py_ssize_t newpos;
2625 PyObject *rep;
2626 Py_ssize_t repsize, k;
2627 rep = unicode_encode_call_errorhandler
2628 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2629 s, size, &exc, i-1, i, &newpos);
2630 if (!rep)
2631 goto error;
2632
2633 if (PyBytes_Check(rep))
2634 repsize = PyBytes_GET_SIZE(rep);
2635 else
2636 repsize = PyUnicode_GET_SIZE(rep);
2637
2638 if (repsize > 4) {
2639 Py_ssize_t offset;
2640
2641 if (result == NULL)
2642 offset = p - stackbuf;
2643 else
2644 offset = p - PyBytes_AS_STRING(result);
2645
2646 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2647 /* integer overflow */
2648 PyErr_NoMemory();
2649 goto error;
2650 }
2651 nallocated += repsize - 4;
2652 if (result != NULL) {
2653 if (_PyBytes_Resize(&result, nallocated) < 0)
2654 goto error;
2655 } else {
2656 result = PyBytes_FromStringAndSize(NULL, nallocated);
2657 if (result == NULL)
2658 goto error;
2659 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2660 }
2661 p = PyBytes_AS_STRING(result) + offset;
2662 }
2663
2664 if (PyBytes_Check(rep)) {
2665 char *prep = PyBytes_AS_STRING(rep);
2666 for(k = repsize; k > 0; k--)
2667 *p++ = *prep++;
2668 } else /* rep is unicode */ {
2669 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2670 Py_UNICODE c;
2671
2672 for(k=0; k<repsize; k++) {
2673 c = prep[k];
2674 if (0x80 <= c) {
2675 raise_encode_exception(&exc, "utf-8", s, size,
2676 i-1, i, "surrogates not allowed");
2677 goto error;
2678 }
2679 *p++ = (char)prep[k];
2680 }
2681 }
2682 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002683#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002684 }
Victor Stinner445a6232010-04-22 20:01:57 +00002685#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002686 } else if (ch < 0x10000) {
2687 *p++ = (char)(0xe0 | (ch >> 12));
2688 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2689 *p++ = (char)(0x80 | (ch & 0x3f));
2690 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002691 /* Encode UCS4 Unicode ordinals */
2692 *p++ = (char)(0xf0 | (ch >> 18));
2693 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2694 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2695 *p++ = (char)(0x80 | (ch & 0x3f));
2696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002698
Guido van Rossum98297ee2007-11-06 21:34:58 +00002699 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002700 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002701 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002702 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002703 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002704 }
2705 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002706 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002707 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002708 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002709 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002710 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002713 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002714 error:
2715 Py_XDECREF(errorHandler);
2716 Py_XDECREF(exc);
2717 Py_XDECREF(result);
2718 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002719
Tim Peters602f7402002-04-27 18:03:26 +00002720#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721}
2722
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2724{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 if (!PyUnicode_Check(unicode)) {
2726 PyErr_BadArgument();
2727 return NULL;
2728 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002729 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002730 PyUnicode_GET_SIZE(unicode),
2731 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732}
2733
Walter Dörwald41980ca2007-08-16 21:55:45 +00002734/* --- UTF-32 Codec ------------------------------------------------------- */
2735
2736PyObject *
2737PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 Py_ssize_t size,
2739 const char *errors,
2740 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002741{
2742 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2743}
2744
2745PyObject *
2746PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002747 Py_ssize_t size,
2748 const char *errors,
2749 int *byteorder,
2750 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002751{
2752 const char *starts = s;
2753 Py_ssize_t startinpos;
2754 Py_ssize_t endinpos;
2755 Py_ssize_t outpos;
2756 PyUnicodeObject *unicode;
2757 Py_UNICODE *p;
2758#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002759 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002760 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002761#else
2762 const int pairs = 0;
2763#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002764 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002765 int bo = 0; /* assume native ordering by default */
2766 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002767 /* Offsets from q for retrieving bytes in the right order. */
2768#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2769 int iorder[] = {0, 1, 2, 3};
2770#else
2771 int iorder[] = {3, 2, 1, 0};
2772#endif
2773 PyObject *errorHandler = NULL;
2774 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002775
Walter Dörwald41980ca2007-08-16 21:55:45 +00002776 q = (unsigned char *)s;
2777 e = q + size;
2778
2779 if (byteorder)
2780 bo = *byteorder;
2781
2782 /* Check for BOM marks (U+FEFF) in the input and adjust current
2783 byte order setting accordingly. In native mode, the leading BOM
2784 mark is skipped, in all other modes, it is copied to the output
2785 stream as-is (giving a ZWNBSP character). */
2786 if (bo == 0) {
2787 if (size >= 4) {
2788 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002789 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002790#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002791 if (bom == 0x0000FEFF) {
2792 q += 4;
2793 bo = -1;
2794 }
2795 else if (bom == 0xFFFE0000) {
2796 q += 4;
2797 bo = 1;
2798 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002799#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002800 if (bom == 0x0000FEFF) {
2801 q += 4;
2802 bo = 1;
2803 }
2804 else if (bom == 0xFFFE0000) {
2805 q += 4;
2806 bo = -1;
2807 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002808#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002810 }
2811
2812 if (bo == -1) {
2813 /* force LE */
2814 iorder[0] = 0;
2815 iorder[1] = 1;
2816 iorder[2] = 2;
2817 iorder[3] = 3;
2818 }
2819 else if (bo == 1) {
2820 /* force BE */
2821 iorder[0] = 3;
2822 iorder[1] = 2;
2823 iorder[2] = 1;
2824 iorder[3] = 0;
2825 }
2826
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002827 /* On narrow builds we split characters outside the BMP into two
2828 codepoints => count how much extra space we need. */
2829#ifndef Py_UNICODE_WIDE
2830 for (qq = q; qq < e; qq += 4)
2831 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2832 pairs++;
2833#endif
2834
2835 /* This might be one to much, because of a BOM */
2836 unicode = _PyUnicode_New((size+3)/4+pairs);
2837 if (!unicode)
2838 return NULL;
2839 if (size == 0)
2840 return (PyObject *)unicode;
2841
2842 /* Unpack UTF-32 encoded data */
2843 p = unicode->str;
2844
Walter Dörwald41980ca2007-08-16 21:55:45 +00002845 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 Py_UCS4 ch;
2847 /* remaining bytes at the end? (size should be divisible by 4) */
2848 if (e-q<4) {
2849 if (consumed)
2850 break;
2851 errmsg = "truncated data";
2852 startinpos = ((const char *)q)-starts;
2853 endinpos = ((const char *)e)-starts;
2854 goto utf32Error;
2855 /* The remaining input chars are ignored if the callback
2856 chooses to skip the input */
2857 }
2858 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2859 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002860
Benjamin Peterson29060642009-01-31 22:14:21 +00002861 if (ch >= 0x110000)
2862 {
2863 errmsg = "codepoint not in range(0x110000)";
2864 startinpos = ((const char *)q)-starts;
2865 endinpos = startinpos+4;
2866 goto utf32Error;
2867 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002868#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002869 if (ch >= 0x10000)
2870 {
2871 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2872 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2873 }
2874 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002875#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002876 *p++ = ch;
2877 q += 4;
2878 continue;
2879 utf32Error:
2880 outpos = p-PyUnicode_AS_UNICODE(unicode);
2881 if (unicode_decode_call_errorhandler(
2882 errors, &errorHandler,
2883 "utf32", errmsg,
2884 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2885 &unicode, &outpos, &p))
2886 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002887 }
2888
2889 if (byteorder)
2890 *byteorder = bo;
2891
2892 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002894
2895 /* Adjust length */
2896 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2897 goto onError;
2898
2899 Py_XDECREF(errorHandler);
2900 Py_XDECREF(exc);
2901 return (PyObject *)unicode;
2902
Benjamin Peterson29060642009-01-31 22:14:21 +00002903 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002904 Py_DECREF(unicode);
2905 Py_XDECREF(errorHandler);
2906 Py_XDECREF(exc);
2907 return NULL;
2908}
2909
2910PyObject *
2911PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 Py_ssize_t size,
2913 const char *errors,
2914 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002915{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002916 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002917 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002918 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002919#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002920 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002921#else
2922 const int pairs = 0;
2923#endif
2924 /* Offsets from p for storing byte pairs in the right order. */
2925#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2926 int iorder[] = {0, 1, 2, 3};
2927#else
2928 int iorder[] = {3, 2, 1, 0};
2929#endif
2930
Benjamin Peterson29060642009-01-31 22:14:21 +00002931#define STORECHAR(CH) \
2932 do { \
2933 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2934 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2935 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2936 p[iorder[0]] = (CH) & 0xff; \
2937 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002938 } while(0)
2939
2940 /* In narrow builds we can output surrogate pairs as one codepoint,
2941 so we need less space. */
2942#ifndef Py_UNICODE_WIDE
2943 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002944 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2945 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2946 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002947#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002948 nsize = (size - pairs + (byteorder == 0));
2949 bytesize = nsize * 4;
2950 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002952 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002953 if (v == NULL)
2954 return NULL;
2955
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002956 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002957 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002958 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002959 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002960 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002961
2962 if (byteorder == -1) {
2963 /* force LE */
2964 iorder[0] = 0;
2965 iorder[1] = 1;
2966 iorder[2] = 2;
2967 iorder[3] = 3;
2968 }
2969 else if (byteorder == 1) {
2970 /* force BE */
2971 iorder[0] = 3;
2972 iorder[1] = 2;
2973 iorder[2] = 1;
2974 iorder[3] = 0;
2975 }
2976
2977 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002979#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002980 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2981 Py_UCS4 ch2 = *s;
2982 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2983 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2984 s++;
2985 size--;
2986 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002987 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002988#endif
2989 STORECHAR(ch);
2990 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002991
2992 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002993 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002994#undef STORECHAR
2995}
2996
2997PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2998{
2999 if (!PyUnicode_Check(unicode)) {
3000 PyErr_BadArgument();
3001 return NULL;
3002 }
3003 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003004 PyUnicode_GET_SIZE(unicode),
3005 NULL,
3006 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003007}
3008
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009/* --- UTF-16 Codec ------------------------------------------------------- */
3010
Tim Peters772747b2001-08-09 22:21:55 +00003011PyObject *
3012PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003013 Py_ssize_t size,
3014 const char *errors,
3015 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016{
Walter Dörwald69652032004-09-07 20:24:22 +00003017 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3018}
3019
Antoine Pitrouab868312009-01-10 15:40:25 +00003020/* Two masks for fast checking of whether a C 'long' may contain
3021 UTF16-encoded surrogate characters. This is an efficient heuristic,
3022 assuming that non-surrogate characters with a code point >= 0x8000 are
3023 rare in most input.
3024 FAST_CHAR_MASK is used when the input is in native byte ordering,
3025 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003026*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003027#if (SIZEOF_LONG == 8)
3028# define FAST_CHAR_MASK 0x8000800080008000L
3029# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3030#elif (SIZEOF_LONG == 4)
3031# define FAST_CHAR_MASK 0x80008000L
3032# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3033#else
3034# error C 'long' size should be either 4 or 8!
3035#endif
3036
Walter Dörwald69652032004-09-07 20:24:22 +00003037PyObject *
3038PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 Py_ssize_t size,
3040 const char *errors,
3041 int *byteorder,
3042 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003043{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003045 Py_ssize_t startinpos;
3046 Py_ssize_t endinpos;
3047 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 PyUnicodeObject *unicode;
3049 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003050 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003051 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003052 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003053 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003054 /* Offsets from q for retrieving byte pairs in the right order. */
3055#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3056 int ihi = 1, ilo = 0;
3057#else
3058 int ihi = 0, ilo = 1;
3059#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060 PyObject *errorHandler = NULL;
3061 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062
3063 /* Note: size will always be longer than the resulting Unicode
3064 character count */
3065 unicode = _PyUnicode_New(size);
3066 if (!unicode)
3067 return NULL;
3068 if (size == 0)
3069 return (PyObject *)unicode;
3070
3071 /* Unpack UTF-16 encoded data */
3072 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003073 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003074 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075
3076 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003077 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003079 /* Check for BOM marks (U+FEFF) in the input and adjust current
3080 byte order setting accordingly. In native mode, the leading BOM
3081 mark is skipped, in all other modes, it is copied to the output
3082 stream as-is (giving a ZWNBSP character). */
3083 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003084 if (size >= 2) {
3085 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003086#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003087 if (bom == 0xFEFF) {
3088 q += 2;
3089 bo = -1;
3090 }
3091 else if (bom == 0xFFFE) {
3092 q += 2;
3093 bo = 1;
3094 }
Tim Petersced69f82003-09-16 20:30:58 +00003095#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003096 if (bom == 0xFEFF) {
3097 q += 2;
3098 bo = 1;
3099 }
3100 else if (bom == 0xFFFE) {
3101 q += 2;
3102 bo = -1;
3103 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003104#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107
Tim Peters772747b2001-08-09 22:21:55 +00003108 if (bo == -1) {
3109 /* force LE */
3110 ihi = 1;
3111 ilo = 0;
3112 }
3113 else if (bo == 1) {
3114 /* force BE */
3115 ihi = 0;
3116 ilo = 1;
3117 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003118#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3119 native_ordering = ilo < ihi;
3120#else
3121 native_ordering = ilo > ihi;
3122#endif
Tim Peters772747b2001-08-09 22:21:55 +00003123
Antoine Pitrouab868312009-01-10 15:40:25 +00003124 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003125 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003126 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003127 /* First check for possible aligned read of a C 'long'. Unaligned
3128 reads are more expensive, better to defer to another iteration. */
3129 if (!((size_t) q & LONG_PTR_MASK)) {
3130 /* Fast path for runs of non-surrogate chars. */
3131 register const unsigned char *_q = q;
3132 Py_UNICODE *_p = p;
3133 if (native_ordering) {
3134 /* Native ordering is simple: as long as the input cannot
3135 possibly contain a surrogate char, do an unrolled copy
3136 of several 16-bit code points to the target object.
3137 The non-surrogate check is done on several input bytes
3138 at a time (as many as a C 'long' can contain). */
3139 while (_q < aligned_end) {
3140 unsigned long data = * (unsigned long *) _q;
3141 if (data & FAST_CHAR_MASK)
3142 break;
3143 _p[0] = ((unsigned short *) _q)[0];
3144 _p[1] = ((unsigned short *) _q)[1];
3145#if (SIZEOF_LONG == 8)
3146 _p[2] = ((unsigned short *) _q)[2];
3147 _p[3] = ((unsigned short *) _q)[3];
3148#endif
3149 _q += SIZEOF_LONG;
3150 _p += SIZEOF_LONG / 2;
3151 }
3152 }
3153 else {
3154 /* Byteswapped ordering is similar, but we must decompose
3155 the copy bytewise, and take care of zero'ing out the
3156 upper bytes if the target object is in 32-bit units
3157 (that is, in UCS-4 builds). */
3158 while (_q < aligned_end) {
3159 unsigned long data = * (unsigned long *) _q;
3160 if (data & SWAPPED_FAST_CHAR_MASK)
3161 break;
3162 /* Zero upper bytes in UCS-4 builds */
3163#if (Py_UNICODE_SIZE > 2)
3164 _p[0] = 0;
3165 _p[1] = 0;
3166#if (SIZEOF_LONG == 8)
3167 _p[2] = 0;
3168 _p[3] = 0;
3169#endif
3170#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003171 /* Issue #4916; UCS-4 builds on big endian machines must
3172 fill the two last bytes of each 4-byte unit. */
3173#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3174# define OFF 2
3175#else
3176# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003177#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003178 ((unsigned char *) _p)[OFF + 1] = _q[0];
3179 ((unsigned char *) _p)[OFF + 0] = _q[1];
3180 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3181 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3182#if (SIZEOF_LONG == 8)
3183 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3184 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3185 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3186 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3187#endif
3188#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003189 _q += SIZEOF_LONG;
3190 _p += SIZEOF_LONG / 2;
3191 }
3192 }
3193 p = _p;
3194 q = _q;
3195 if (q >= e)
3196 break;
3197 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003198 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003199
Benjamin Peterson14339b62009-01-31 16:36:08 +00003200 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003201
3202 if (ch < 0xD800 || ch > 0xDFFF) {
3203 *p++ = ch;
3204 continue;
3205 }
3206
3207 /* UTF-16 code pair: */
3208 if (q > e) {
3209 errmsg = "unexpected end of data";
3210 startinpos = (((const char *)q) - 2) - starts;
3211 endinpos = ((const char *)e) + 1 - starts;
3212 goto utf16Error;
3213 }
3214 if (0xD800 <= ch && ch <= 0xDBFF) {
3215 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3216 q += 2;
3217 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003218#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 *p++ = ch;
3220 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003221#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003222 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003223#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 continue;
3225 }
3226 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003227 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 startinpos = (((const char *)q)-4)-starts;
3229 endinpos = startinpos+2;
3230 goto utf16Error;
3231 }
3232
Benjamin Peterson14339b62009-01-31 16:36:08 +00003233 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 errmsg = "illegal encoding";
3235 startinpos = (((const char *)q)-2)-starts;
3236 endinpos = startinpos+2;
3237 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003238
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 utf16Error:
3240 outpos = p - PyUnicode_AS_UNICODE(unicode);
3241 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003242 errors,
3243 &errorHandler,
3244 "utf16", errmsg,
3245 &starts,
3246 (const char **)&e,
3247 &startinpos,
3248 &endinpos,
3249 &exc,
3250 (const char **)&q,
3251 &unicode,
3252 &outpos,
3253 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003254 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003256 /* remaining byte at the end? (size should be even) */
3257 if (e == q) {
3258 if (!consumed) {
3259 errmsg = "truncated data";
3260 startinpos = ((const char *)q) - starts;
3261 endinpos = ((const char *)e) + 1 - starts;
3262 outpos = p - PyUnicode_AS_UNICODE(unicode);
3263 if (unicode_decode_call_errorhandler(
3264 errors,
3265 &errorHandler,
3266 "utf16", errmsg,
3267 &starts,
3268 (const char **)&e,
3269 &startinpos,
3270 &endinpos,
3271 &exc,
3272 (const char **)&q,
3273 &unicode,
3274 &outpos,
3275 &p))
3276 goto onError;
3277 /* The remaining input chars are ignored if the callback
3278 chooses to skip the input */
3279 }
3280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281
3282 if (byteorder)
3283 *byteorder = bo;
3284
Walter Dörwald69652032004-09-07 20:24:22 +00003285 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003286 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003287
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003289 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 goto onError;
3291
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 Py_XDECREF(errorHandler);
3293 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 return (PyObject *)unicode;
3295
Benjamin Peterson29060642009-01-31 22:14:21 +00003296 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 Py_XDECREF(errorHandler);
3299 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 return NULL;
3301}
3302
Antoine Pitrouab868312009-01-10 15:40:25 +00003303#undef FAST_CHAR_MASK
3304#undef SWAPPED_FAST_CHAR_MASK
3305
Tim Peters772747b2001-08-09 22:21:55 +00003306PyObject *
3307PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003308 Py_ssize_t size,
3309 const char *errors,
3310 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003312 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003313 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003314 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003315#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003316 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003317#else
3318 const int pairs = 0;
3319#endif
Tim Peters772747b2001-08-09 22:21:55 +00003320 /* Offsets from p for storing byte pairs in the right order. */
3321#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3322 int ihi = 1, ilo = 0;
3323#else
3324 int ihi = 0, ilo = 1;
3325#endif
3326
Benjamin Peterson29060642009-01-31 22:14:21 +00003327#define STORECHAR(CH) \
3328 do { \
3329 p[ihi] = ((CH) >> 8) & 0xff; \
3330 p[ilo] = (CH) & 0xff; \
3331 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003332 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003334#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003335 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 if (s[i] >= 0x10000)
3337 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003338#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003339 /* 2 * (size + pairs + (byteorder == 0)) */
3340 if (size > PY_SSIZE_T_MAX ||
3341 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003343 nsize = size + pairs + (byteorder == 0);
3344 bytesize = nsize * 2;
3345 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003346 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003347 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 if (v == NULL)
3349 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003351 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003353 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003354 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003355 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003356
3357 if (byteorder == -1) {
3358 /* force LE */
3359 ihi = 1;
3360 ilo = 0;
3361 }
3362 else if (byteorder == 1) {
3363 /* force BE */
3364 ihi = 0;
3365 ilo = 1;
3366 }
3367
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003368 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 Py_UNICODE ch = *s++;
3370 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003371#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 if (ch >= 0x10000) {
3373 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3374 ch = 0xD800 | ((ch-0x10000) >> 10);
3375 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003376#endif
Tim Peters772747b2001-08-09 22:21:55 +00003377 STORECHAR(ch);
3378 if (ch2)
3379 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003380 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003381
3382 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003383 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003384#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385}
3386
3387PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3388{
3389 if (!PyUnicode_Check(unicode)) {
3390 PyErr_BadArgument();
3391 return NULL;
3392 }
3393 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 PyUnicode_GET_SIZE(unicode),
3395 NULL,
3396 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397}
3398
3399/* --- Unicode Escape Codec ----------------------------------------------- */
3400
Fredrik Lundh06d12682001-01-24 07:59:11 +00003401static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003402
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003404 Py_ssize_t size,
3405 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003408 Py_ssize_t startinpos;
3409 Py_ssize_t endinpos;
3410 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003415 char* message;
3416 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 PyObject *errorHandler = NULL;
3418 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003419
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 /* Escaped strings will always be longer than the resulting
3421 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 length after conversion to the true value.
3423 (but if the error callback returns a long replacement string
3424 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 v = _PyUnicode_New(size);
3426 if (v == NULL)
3427 goto onError;
3428 if (size == 0)
3429 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003430
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003433
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434 while (s < end) {
3435 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003436 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438
3439 /* Non-escape characters are interpreted as Unicode ordinals */
3440 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003441 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442 continue;
3443 }
3444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 /* \ - Escapes */
3447 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003448 c = *s++;
3449 if (s > end)
3450 c = '\0'; /* Invalid after \ */
3451 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452
Benjamin Peterson29060642009-01-31 22:14:21 +00003453 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 case '\n': break;
3455 case '\\': *p++ = '\\'; break;
3456 case '\'': *p++ = '\''; break;
3457 case '\"': *p++ = '\"'; break;
3458 case 'b': *p++ = '\b'; break;
3459 case 'f': *p++ = '\014'; break; /* FF */
3460 case 't': *p++ = '\t'; break;
3461 case 'n': *p++ = '\n'; break;
3462 case 'r': *p++ = '\r'; break;
3463 case 'v': *p++ = '\013'; break; /* VT */
3464 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3465
Benjamin Peterson29060642009-01-31 22:14:21 +00003466 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 case '0': case '1': case '2': case '3':
3468 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003469 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003470 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003471 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003472 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003473 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003475 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 break;
3477
Benjamin Peterson29060642009-01-31 22:14:21 +00003478 /* hex escapes */
3479 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003481 digits = 2;
3482 message = "truncated \\xXX escape";
3483 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484
Benjamin Peterson29060642009-01-31 22:14:21 +00003485 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003487 digits = 4;
3488 message = "truncated \\uXXXX escape";
3489 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490
Benjamin Peterson29060642009-01-31 22:14:21 +00003491 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003492 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003493 digits = 8;
3494 message = "truncated \\UXXXXXXXX escape";
3495 hexescape:
3496 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 outpos = p-PyUnicode_AS_UNICODE(v);
3498 if (s+digits>end) {
3499 endinpos = size;
3500 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003501 errors, &errorHandler,
3502 "unicodeescape", "end of string in escape sequence",
3503 &starts, &end, &startinpos, &endinpos, &exc, &s,
3504 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 goto onError;
3506 goto nextByte;
3507 }
3508 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003509 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003510 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 endinpos = (s+i+1)-starts;
3512 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003513 errors, &errorHandler,
3514 "unicodeescape", message,
3515 &starts, &end, &startinpos, &endinpos, &exc, &s,
3516 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003517 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003519 }
3520 chr = (chr<<4) & ~0xF;
3521 if (c >= '0' && c <= '9')
3522 chr += c - '0';
3523 else if (c >= 'a' && c <= 'f')
3524 chr += 10 + c - 'a';
3525 else
3526 chr += 10 + c - 'A';
3527 }
3528 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003529 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 /* _decoding_error will have already written into the
3531 target buffer. */
3532 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003533 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003534 /* when we get here, chr is a 32-bit unicode character */
3535 if (chr <= 0xffff)
3536 /* UCS-2 character */
3537 *p++ = (Py_UNICODE) chr;
3538 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003539 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003540 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003541#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003542 *p++ = chr;
3543#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003544 chr -= 0x10000L;
3545 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003546 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003547#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003548 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 endinpos = s-starts;
3550 outpos = p-PyUnicode_AS_UNICODE(v);
3551 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003552 errors, &errorHandler,
3553 "unicodeescape", "illegal Unicode character",
3554 &starts, &end, &startinpos, &endinpos, &exc, &s,
3555 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003556 goto onError;
3557 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003558 break;
3559
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003561 case 'N':
3562 message = "malformed \\N character escape";
3563 if (ucnhash_CAPI == NULL) {
3564 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003565 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003566 if (ucnhash_CAPI == NULL)
3567 goto ucnhashError;
3568 }
3569 if (*s == '{') {
3570 const char *start = s+1;
3571 /* look for the closing brace */
3572 while (*s != '}' && s < end)
3573 s++;
3574 if (s > start && s < end && *s == '}') {
3575 /* found a name. look it up in the unicode database */
3576 message = "unknown Unicode character name";
3577 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003578 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003579 goto store;
3580 }
3581 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 endinpos = s-starts;
3583 outpos = p-PyUnicode_AS_UNICODE(v);
3584 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003585 errors, &errorHandler,
3586 "unicodeescape", message,
3587 &starts, &end, &startinpos, &endinpos, &exc, &s,
3588 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003589 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003590 break;
3591
3592 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003593 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 message = "\\ at end of string";
3595 s--;
3596 endinpos = s-starts;
3597 outpos = p-PyUnicode_AS_UNICODE(v);
3598 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003599 errors, &errorHandler,
3600 "unicodeescape", message,
3601 &starts, &end, &startinpos, &endinpos, &exc, &s,
3602 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003603 goto onError;
3604 }
3605 else {
3606 *p++ = '\\';
3607 *p++ = (unsigned char)s[-1];
3608 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003609 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003611 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003614 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003616 Py_XDECREF(errorHandler);
3617 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003619
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003621 PyErr_SetString(
3622 PyExc_UnicodeError,
3623 "\\N escapes not supported (can't load unicodedata module)"
3624 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003625 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626 Py_XDECREF(errorHandler);
3627 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003628 return NULL;
3629
Benjamin Peterson29060642009-01-31 22:14:21 +00003630 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 Py_XDECREF(errorHandler);
3633 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 return NULL;
3635}
3636
3637/* Return a Unicode-Escape string version of the Unicode object.
3638
3639 If quotes is true, the string is enclosed in u"" or u'' quotes as
3640 appropriate.
3641
3642*/
3643
Thomas Wouters477c8d52006-05-27 19:21:47 +00003644Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 Py_ssize_t size,
3646 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003647{
3648 /* like wcschr, but doesn't stop at NULL characters */
3649
3650 while (size-- > 0) {
3651 if (*s == ch)
3652 return s;
3653 s++;
3654 }
3655
3656 return NULL;
3657}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003658
Walter Dörwald79e913e2007-05-12 11:08:06 +00003659static const char *hexdigits = "0123456789abcdef";
3660
3661PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003664 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003667#ifdef Py_UNICODE_WIDE
3668 const Py_ssize_t expandsize = 10;
3669#else
3670 const Py_ssize_t expandsize = 6;
3671#endif
3672
Thomas Wouters89f507f2006-12-13 04:49:30 +00003673 /* XXX(nnorwitz): rather than over-allocating, it would be
3674 better to choose a different scheme. Perhaps scan the
3675 first N-chars of the string and allocate based on that size.
3676 */
3677 /* Initial allocation is based on the longest-possible unichr
3678 escape.
3679
3680 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3681 unichr, so in this case it's the longest unichr escape. In
3682 narrow (UTF-16) builds this is five chars per source unichr
3683 since there are two unichrs in the surrogate pair, so in narrow
3684 (UTF-16) builds it's not the longest unichr escape.
3685
3686 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3687 so in the narrow (UTF-16) build case it's the longest unichr
3688 escape.
3689 */
3690
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003691 if (size == 0)
3692 return PyBytes_FromStringAndSize(NULL, 0);
3693
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003694 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003696
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003697 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 2
3699 + expandsize*size
3700 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 if (repr == NULL)
3702 return NULL;
3703
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003704 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 while (size-- > 0) {
3707 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003708
Walter Dörwald79e913e2007-05-12 11:08:06 +00003709 /* Escape backslashes */
3710 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 *p++ = '\\';
3712 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003713 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003714 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003715
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003716#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003717 /* Map 21-bit characters to '\U00xxxxxx' */
3718 else if (ch >= 0x10000) {
3719 *p++ = '\\';
3720 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003721 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3722 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3723 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3724 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3725 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3726 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3727 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3728 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003730 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003731#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003732 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3733 else if (ch >= 0xD800 && ch < 0xDC00) {
3734 Py_UNICODE ch2;
3735 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003736
Benjamin Peterson29060642009-01-31 22:14:21 +00003737 ch2 = *s++;
3738 size--;
3739 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3740 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3741 *p++ = '\\';
3742 *p++ = 'U';
3743 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3744 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3745 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3746 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3747 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3748 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3749 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3750 *p++ = hexdigits[ucs & 0x0000000F];
3751 continue;
3752 }
3753 /* Fall through: isolated surrogates are copied as-is */
3754 s--;
3755 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003756 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003757#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003758
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003760 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 *p++ = '\\';
3762 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003763 *p++ = hexdigits[(ch >> 12) & 0x000F];
3764 *p++ = hexdigits[(ch >> 8) & 0x000F];
3765 *p++ = hexdigits[(ch >> 4) & 0x000F];
3766 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003768
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003769 /* Map special whitespace to '\t', \n', '\r' */
3770 else if (ch == '\t') {
3771 *p++ = '\\';
3772 *p++ = 't';
3773 }
3774 else if (ch == '\n') {
3775 *p++ = '\\';
3776 *p++ = 'n';
3777 }
3778 else if (ch == '\r') {
3779 *p++ = '\\';
3780 *p++ = 'r';
3781 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003782
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003783 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003784 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003786 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003787 *p++ = hexdigits[(ch >> 4) & 0x000F];
3788 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003789 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003790
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 /* Copy everything else as-is */
3792 else
3793 *p++ = (char) ch;
3794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003796 assert(p - PyBytes_AS_STRING(repr) > 0);
3797 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3798 return NULL;
3799 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800}
3801
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003802PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003804 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 if (!PyUnicode_Check(unicode)) {
3806 PyErr_BadArgument();
3807 return NULL;
3808 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003809 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3810 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003811 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812}
3813
3814/* --- Raw Unicode Escape Codec ------------------------------------------- */
3815
3816PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 Py_ssize_t size,
3818 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003821 Py_ssize_t startinpos;
3822 Py_ssize_t endinpos;
3823 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003825 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 const char *end;
3827 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 PyObject *errorHandler = NULL;
3829 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003830
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 /* Escaped strings will always be longer than the resulting
3832 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 length after conversion to the true value. (But decoding error
3834 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 v = _PyUnicode_New(size);
3836 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003837 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003839 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 end = s + size;
3842 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 unsigned char c;
3844 Py_UCS4 x;
3845 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003846 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847
Benjamin Peterson29060642009-01-31 22:14:21 +00003848 /* Non-escape characters are interpreted as Unicode ordinals */
3849 if (*s != '\\') {
3850 *p++ = (unsigned char)*s++;
3851 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003852 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 startinpos = s-starts;
3854
3855 /* \u-escapes are only interpreted iff the number of leading
3856 backslashes if odd */
3857 bs = s;
3858 for (;s < end;) {
3859 if (*s != '\\')
3860 break;
3861 *p++ = (unsigned char)*s++;
3862 }
3863 if (((s - bs) & 1) == 0 ||
3864 s >= end ||
3865 (*s != 'u' && *s != 'U')) {
3866 continue;
3867 }
3868 p--;
3869 count = *s=='u' ? 4 : 8;
3870 s++;
3871
3872 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3873 outpos = p-PyUnicode_AS_UNICODE(v);
3874 for (x = 0, i = 0; i < count; ++i, ++s) {
3875 c = (unsigned char)*s;
3876 if (!ISXDIGIT(c)) {
3877 endinpos = s-starts;
3878 if (unicode_decode_call_errorhandler(
3879 errors, &errorHandler,
3880 "rawunicodeescape", "truncated \\uXXXX",
3881 &starts, &end, &startinpos, &endinpos, &exc, &s,
3882 &v, &outpos, &p))
3883 goto onError;
3884 goto nextByte;
3885 }
3886 x = (x<<4) & ~0xF;
3887 if (c >= '0' && c <= '9')
3888 x += c - '0';
3889 else if (c >= 'a' && c <= 'f')
3890 x += 10 + c - 'a';
3891 else
3892 x += 10 + c - 'A';
3893 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003894 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003895 /* UCS-2 character */
3896 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003897 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003898 /* UCS-4 character. Either store directly, or as
3899 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003900#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003902#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003903 x -= 0x10000L;
3904 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3905 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003906#endif
3907 } else {
3908 endinpos = s-starts;
3909 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003910 if (unicode_decode_call_errorhandler(
3911 errors, &errorHandler,
3912 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003913 &starts, &end, &startinpos, &endinpos, &exc, &s,
3914 &v, &outpos, &p))
3915 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003916 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 nextByte:
3918 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003920 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003921 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 Py_XDECREF(errorHandler);
3923 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003925
Benjamin Peterson29060642009-01-31 22:14:21 +00003926 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928 Py_XDECREF(errorHandler);
3929 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 return NULL;
3931}
3932
3933PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003934 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003936 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 char *p;
3938 char *q;
3939
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003940#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003941 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003942#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003943 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003944#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003945
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003946 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003947 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003948
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003949 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 if (repr == NULL)
3951 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003952 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003953 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003955 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 while (size-- > 0) {
3957 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003958#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003959 /* Map 32-bit characters to '\Uxxxxxxxx' */
3960 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003961 *p++ = '\\';
3962 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003963 *p++ = hexdigits[(ch >> 28) & 0xf];
3964 *p++ = hexdigits[(ch >> 24) & 0xf];
3965 *p++ = hexdigits[(ch >> 20) & 0xf];
3966 *p++ = hexdigits[(ch >> 16) & 0xf];
3967 *p++ = hexdigits[(ch >> 12) & 0xf];
3968 *p++ = hexdigits[(ch >> 8) & 0xf];
3969 *p++ = hexdigits[(ch >> 4) & 0xf];
3970 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003971 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003972 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003973#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3975 if (ch >= 0xD800 && ch < 0xDC00) {
3976 Py_UNICODE ch2;
3977 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003978
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 ch2 = *s++;
3980 size--;
3981 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3982 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3983 *p++ = '\\';
3984 *p++ = 'U';
3985 *p++ = hexdigits[(ucs >> 28) & 0xf];
3986 *p++ = hexdigits[(ucs >> 24) & 0xf];
3987 *p++ = hexdigits[(ucs >> 20) & 0xf];
3988 *p++ = hexdigits[(ucs >> 16) & 0xf];
3989 *p++ = hexdigits[(ucs >> 12) & 0xf];
3990 *p++ = hexdigits[(ucs >> 8) & 0xf];
3991 *p++ = hexdigits[(ucs >> 4) & 0xf];
3992 *p++ = hexdigits[ucs & 0xf];
3993 continue;
3994 }
3995 /* Fall through: isolated surrogates are copied as-is */
3996 s--;
3997 size++;
3998 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003999#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 /* Map 16-bit characters to '\uxxxx' */
4001 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 *p++ = '\\';
4003 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004004 *p++ = hexdigits[(ch >> 12) & 0xf];
4005 *p++ = hexdigits[(ch >> 8) & 0xf];
4006 *p++ = hexdigits[(ch >> 4) & 0xf];
4007 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004009 /* Copy everything else as-is */
4010 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 *p++ = (char) ch;
4012 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004013 size = p - q;
4014
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004015 assert(size > 0);
4016 if (_PyBytes_Resize(&repr, size) < 0)
4017 return NULL;
4018 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019}
4020
4021PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4022{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004023 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004025 PyErr_BadArgument();
4026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004028 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4029 PyUnicode_GET_SIZE(unicode));
4030
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004031 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032}
4033
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004034/* --- Unicode Internal Codec ------------------------------------------- */
4035
4036PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 Py_ssize_t size,
4038 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004039{
4040 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004041 Py_ssize_t startinpos;
4042 Py_ssize_t endinpos;
4043 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004044 PyUnicodeObject *v;
4045 Py_UNICODE *p;
4046 const char *end;
4047 const char *reason;
4048 PyObject *errorHandler = NULL;
4049 PyObject *exc = NULL;
4050
Neal Norwitzd43069c2006-01-08 01:12:10 +00004051#ifdef Py_UNICODE_WIDE
4052 Py_UNICODE unimax = PyUnicode_GetMax();
4053#endif
4054
Thomas Wouters89f507f2006-12-13 04:49:30 +00004055 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004056 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4057 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004058 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004059 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004061 p = PyUnicode_AS_UNICODE(v);
4062 end = s + size;
4063
4064 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004065 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004066 /* We have to sanity check the raw data, otherwise doom looms for
4067 some malformed UCS-4 data. */
4068 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004069#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004070 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004071#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004072 end-s < Py_UNICODE_SIZE
4073 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004074 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004075 startinpos = s - starts;
4076 if (end-s < Py_UNICODE_SIZE) {
4077 endinpos = end-starts;
4078 reason = "truncated input";
4079 }
4080 else {
4081 endinpos = s - starts + Py_UNICODE_SIZE;
4082 reason = "illegal code point (> 0x10FFFF)";
4083 }
4084 outpos = p - PyUnicode_AS_UNICODE(v);
4085 if (unicode_decode_call_errorhandler(
4086 errors, &errorHandler,
4087 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004088 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004089 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004090 goto onError;
4091 }
4092 }
4093 else {
4094 p++;
4095 s += Py_UNICODE_SIZE;
4096 }
4097 }
4098
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004099 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004100 goto onError;
4101 Py_XDECREF(errorHandler);
4102 Py_XDECREF(exc);
4103 return (PyObject *)v;
4104
Benjamin Peterson29060642009-01-31 22:14:21 +00004105 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004106 Py_XDECREF(v);
4107 Py_XDECREF(errorHandler);
4108 Py_XDECREF(exc);
4109 return NULL;
4110}
4111
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112/* --- Latin-1 Codec ------------------------------------------------------ */
4113
4114PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 Py_ssize_t size,
4116 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117{
4118 PyUnicodeObject *v;
4119 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004120 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004121
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004123 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 Py_UNICODE r = *(unsigned char*)s;
4125 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004126 }
4127
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 v = _PyUnicode_New(size);
4129 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004134 e = s + size;
4135 /* Unrolling the copy makes it much faster by reducing the looping
4136 overhead. This is similar to what many memcpy() implementations do. */
4137 unrolled_end = e - 4;
4138 while (s < unrolled_end) {
4139 p[0] = (unsigned char) s[0];
4140 p[1] = (unsigned char) s[1];
4141 p[2] = (unsigned char) s[2];
4142 p[3] = (unsigned char) s[3];
4143 s += 4;
4144 p += 4;
4145 }
4146 while (s < e)
4147 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004149
Benjamin Peterson29060642009-01-31 22:14:21 +00004150 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 Py_XDECREF(v);
4152 return NULL;
4153}
4154
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155/* create or adjust a UnicodeEncodeError */
4156static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004157 const char *encoding,
4158 const Py_UNICODE *unicode, Py_ssize_t size,
4159 Py_ssize_t startpos, Py_ssize_t endpos,
4160 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 *exceptionObject = PyUnicodeEncodeError_Create(
4164 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 }
4166 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4168 goto onError;
4169 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4170 goto onError;
4171 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4172 goto onError;
4173 return;
4174 onError:
4175 Py_DECREF(*exceptionObject);
4176 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 }
4178}
4179
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180/* raises a UnicodeEncodeError */
4181static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 const char *encoding,
4183 const Py_UNICODE *unicode, Py_ssize_t size,
4184 Py_ssize_t startpos, Py_ssize_t endpos,
4185 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186{
4187 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191}
4192
4193/* error handling callback helper:
4194 build arguments, call the callback and check the arguments,
4195 put the result into newpos and return the replacement string, which
4196 has to be freed by the caller */
4197static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 PyObject **errorHandler,
4199 const char *encoding, const char *reason,
4200 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4201 Py_ssize_t startpos, Py_ssize_t endpos,
4202 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004204 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205
4206 PyObject *restuple;
4207 PyObject *resunicode;
4208
4209 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 }
4214
4215 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004218 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219
4220 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004223 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004225 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 Py_DECREF(restuple);
4227 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004229 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 &resunicode, newpos)) {
4231 Py_DECREF(restuple);
4232 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004234 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4235 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4236 Py_DECREF(restuple);
4237 return NULL;
4238 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004241 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004242 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4243 Py_DECREF(restuple);
4244 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004245 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246 Py_INCREF(resunicode);
4247 Py_DECREF(restuple);
4248 return resunicode;
4249}
4250
4251static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 Py_ssize_t size,
4253 const char *errors,
4254 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255{
4256 /* output object */
4257 PyObject *res;
4258 /* pointers to the beginning and end+1 of input */
4259 const Py_UNICODE *startp = p;
4260 const Py_UNICODE *endp = p + size;
4261 /* pointer to the beginning of the unencodable characters */
4262 /* const Py_UNICODE *badp = NULL; */
4263 /* pointer into the output */
4264 char *str;
4265 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004266 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004267 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4268 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 PyObject *errorHandler = NULL;
4270 PyObject *exc = NULL;
4271 /* the following variable is used for caching string comparisons
4272 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4273 int known_errorHandler = -1;
4274
4275 /* allocate enough for a simple encoding without
4276 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004277 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004278 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004279 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004281 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004282 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 ressize = size;
4284
4285 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 /* can we encode this? */
4289 if (c<limit) {
4290 /* no overflow check, because we know that the space is enough */
4291 *str++ = (char)c;
4292 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004293 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 else {
4295 Py_ssize_t unicodepos = p-startp;
4296 Py_ssize_t requiredsize;
4297 PyObject *repunicode;
4298 Py_ssize_t repsize;
4299 Py_ssize_t newpos;
4300 Py_ssize_t respos;
4301 Py_UNICODE *uni2;
4302 /* startpos for collecting unencodable chars */
4303 const Py_UNICODE *collstart = p;
4304 const Py_UNICODE *collend = p;
4305 /* find all unecodable characters */
4306 while ((collend < endp) && ((*collend)>=limit))
4307 ++collend;
4308 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4309 if (known_errorHandler==-1) {
4310 if ((errors==NULL) || (!strcmp(errors, "strict")))
4311 known_errorHandler = 1;
4312 else if (!strcmp(errors, "replace"))
4313 known_errorHandler = 2;
4314 else if (!strcmp(errors, "ignore"))
4315 known_errorHandler = 3;
4316 else if (!strcmp(errors, "xmlcharrefreplace"))
4317 known_errorHandler = 4;
4318 else
4319 known_errorHandler = 0;
4320 }
4321 switch (known_errorHandler) {
4322 case 1: /* strict */
4323 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4324 goto onError;
4325 case 2: /* replace */
4326 while (collstart++<collend)
4327 *str++ = '?'; /* fall through */
4328 case 3: /* ignore */
4329 p = collend;
4330 break;
4331 case 4: /* xmlcharrefreplace */
4332 respos = str - PyBytes_AS_STRING(res);
4333 /* determine replacement size (temporarily (mis)uses p) */
4334 for (p = collstart, repsize = 0; p < collend; ++p) {
4335 if (*p<10)
4336 repsize += 2+1+1;
4337 else if (*p<100)
4338 repsize += 2+2+1;
4339 else if (*p<1000)
4340 repsize += 2+3+1;
4341 else if (*p<10000)
4342 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004343#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 else
4345 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004346#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 else if (*p<100000)
4348 repsize += 2+5+1;
4349 else if (*p<1000000)
4350 repsize += 2+6+1;
4351 else
4352 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004353#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004354 }
4355 requiredsize = respos+repsize+(endp-collend);
4356 if (requiredsize > ressize) {
4357 if (requiredsize<2*ressize)
4358 requiredsize = 2*ressize;
4359 if (_PyBytes_Resize(&res, requiredsize))
4360 goto onError;
4361 str = PyBytes_AS_STRING(res) + respos;
4362 ressize = requiredsize;
4363 }
4364 /* generate replacement (temporarily (mis)uses p) */
4365 for (p = collstart; p < collend; ++p) {
4366 str += sprintf(str, "&#%d;", (int)*p);
4367 }
4368 p = collend;
4369 break;
4370 default:
4371 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4372 encoding, reason, startp, size, &exc,
4373 collstart-startp, collend-startp, &newpos);
4374 if (repunicode == NULL)
4375 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004376 if (PyBytes_Check(repunicode)) {
4377 /* Directly copy bytes result to output. */
4378 repsize = PyBytes_Size(repunicode);
4379 if (repsize > 1) {
4380 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004381 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004382 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4383 Py_DECREF(repunicode);
4384 goto onError;
4385 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004386 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004387 ressize += repsize-1;
4388 }
4389 memcpy(str, PyBytes_AsString(repunicode), repsize);
4390 str += repsize;
4391 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004392 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004393 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004394 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 /* need more space? (at least enough for what we
4396 have+the replacement+the rest of the string, so
4397 we won't have to check space for encodable characters) */
4398 respos = str - PyBytes_AS_STRING(res);
4399 repsize = PyUnicode_GET_SIZE(repunicode);
4400 requiredsize = respos+repsize+(endp-collend);
4401 if (requiredsize > ressize) {
4402 if (requiredsize<2*ressize)
4403 requiredsize = 2*ressize;
4404 if (_PyBytes_Resize(&res, requiredsize)) {
4405 Py_DECREF(repunicode);
4406 goto onError;
4407 }
4408 str = PyBytes_AS_STRING(res) + respos;
4409 ressize = requiredsize;
4410 }
4411 /* check if there is anything unencodable in the replacement
4412 and copy it to the output */
4413 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4414 c = *uni2;
4415 if (c >= limit) {
4416 raise_encode_exception(&exc, encoding, startp, size,
4417 unicodepos, unicodepos+1, reason);
4418 Py_DECREF(repunicode);
4419 goto onError;
4420 }
4421 *str = (char)c;
4422 }
4423 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004424 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004426 }
4427 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004428 /* Resize if we allocated to much */
4429 size = str - PyBytes_AS_STRING(res);
4430 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004431 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004432 if (_PyBytes_Resize(&res, size) < 0)
4433 goto onError;
4434 }
4435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 Py_XDECREF(errorHandler);
4437 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004438 return res;
4439
4440 onError:
4441 Py_XDECREF(res);
4442 Py_XDECREF(errorHandler);
4443 Py_XDECREF(exc);
4444 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445}
4446
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 Py_ssize_t size,
4449 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452}
4453
4454PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4455{
4456 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 PyErr_BadArgument();
4458 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 }
4460 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 PyUnicode_GET_SIZE(unicode),
4462 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463}
4464
4465/* --- 7-bit ASCII Codec -------------------------------------------------- */
4466
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004468 Py_ssize_t size,
4469 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 PyUnicodeObject *v;
4473 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004474 Py_ssize_t startinpos;
4475 Py_ssize_t endinpos;
4476 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 const char *e;
4478 PyObject *errorHandler = NULL;
4479 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004480
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004482 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 Py_UNICODE r = *(unsigned char*)s;
4484 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004485 }
Tim Petersced69f82003-09-16 20:30:58 +00004486
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487 v = _PyUnicode_New(size);
4488 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 e = s + size;
4494 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 register unsigned char c = (unsigned char)*s;
4496 if (c < 128) {
4497 *p++ = c;
4498 ++s;
4499 }
4500 else {
4501 startinpos = s-starts;
4502 endinpos = startinpos + 1;
4503 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4504 if (unicode_decode_call_errorhandler(
4505 errors, &errorHandler,
4506 "ascii", "ordinal not in range(128)",
4507 &starts, &e, &startinpos, &endinpos, &exc, &s,
4508 &v, &outpos, &p))
4509 goto onError;
4510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004512 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4514 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 Py_XDECREF(errorHandler);
4516 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004518
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 Py_XDECREF(errorHandler);
4522 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 return NULL;
4524}
4525
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 Py_ssize_t size,
4528 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531}
4532
4533PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4534{
4535 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004536 PyErr_BadArgument();
4537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538 }
4539 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 PyUnicode_GET_SIZE(unicode),
4541 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542}
4543
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004544#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004545
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004546/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004547
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004548#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004549#define NEED_RETRY
4550#endif
4551
4552/* XXX This code is limited to "true" double-byte encodings, as
4553 a) it assumes an incomplete character consists of a single byte, and
4554 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004556
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   IsDBCSLeadByte() alone only classifies the byte value, so when it says
   "yes" the position is cross-checked against the character that
   CharPrev() reports as preceding it. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* prev starts a two-byte character that ends right before curr */
    return curr - prev == 2;
}
4567
4568/*
4569 * Decode MBCS string into unicode object. If 'final' is set, converts
4570 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4571 */
4572static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004573 const char *s, /* MBCS string */
4574 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004575 int final,
4576 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004577{
4578 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004579 Py_ssize_t n;
4580 DWORD usize;
4581 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004582
4583 assert(size >= 0);
4584
Victor Stinner554f3f02010-06-16 23:33:54 +00004585 /* check and handle 'errors' arg */
4586 if (errors==NULL || strcmp(errors, "strict")==0)
4587 flags = MB_ERR_INVALID_CHARS;
4588 else if (strcmp(errors, "ignore")==0)
4589 flags = 0;
4590 else {
4591 PyErr_Format(PyExc_ValueError,
4592 "mbcs encoding does not support errors='%s'",
4593 errors);
4594 return -1;
4595 }
4596
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004597 /* Skip trailing lead-byte unless 'final' is set */
4598 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004600
4601 /* First get the size of the result */
4602 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004603 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4604 if (usize==0)
4605 goto mbcs_decode_error;
4606 } else
4607 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004608
4609 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 /* Create unicode object */
4611 *v = _PyUnicode_New(usize);
4612 if (*v == NULL)
4613 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004614 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004615 }
4616 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 /* Extend unicode object */
4618 n = PyUnicode_GET_SIZE(*v);
4619 if (_PyUnicode_Resize(v, n + usize) < 0)
4620 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004621 }
4622
4623 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004624 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004626 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4627 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004628 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004629 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004630 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004631
4632mbcs_decode_error:
4633 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4634 we raise a UnicodeDecodeError - else it is a 'generic'
4635 windows error
4636 */
4637 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4638 /* Ideally, we should get reason from FormatMessage - this
4639 is the Windows 2000 English version of the message
4640 */
4641 PyObject *exc = NULL;
4642 const char *reason = "No mapping for the Unicode character exists "
4643 "in the target multi-byte code page.";
4644 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4645 if (exc != NULL) {
4646 PyCodec_StrictErrors(exc);
4647 Py_DECREF(exc);
4648 }
4649 } else {
4650 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4651 }
4652 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004653}
4654
4655PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 Py_ssize_t size,
4657 const char *errors,
4658 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004659{
4660 PyUnicodeObject *v = NULL;
4661 int done;
4662
4663 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004665
4666#ifdef NEED_RETRY
4667 retry:
4668 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004669 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004670 else
4671#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004672 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004673
4674 if (done < 0) {
4675 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004676 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004677 }
4678
4679 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004681
4682#ifdef NEED_RETRY
4683 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 s += done;
4685 size -= done;
4686 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004687 }
4688#endif
4689
4690 return (PyObject *)v;
4691}
4692
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004693PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 Py_ssize_t size,
4695 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004696{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004697 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4698}
4699
4700/*
4701 * Convert unicode into string object (MBCS).
4702 * Returns 0 if succeed, -1 otherwise.
4703 */
4704static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004705 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004706 int size, /* size of unicode */
4707 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004708{
Victor Stinner554f3f02010-06-16 23:33:54 +00004709 BOOL usedDefaultChar = FALSE;
4710 BOOL *pusedDefaultChar;
4711 int mbcssize;
4712 Py_ssize_t n;
4713 PyObject *exc = NULL;
4714 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004715
4716 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004717
Victor Stinner554f3f02010-06-16 23:33:54 +00004718 /* check and handle 'errors' arg */
4719 if (errors==NULL || strcmp(errors, "strict")==0) {
4720 flags = WC_NO_BEST_FIT_CHARS;
4721 pusedDefaultChar = &usedDefaultChar;
4722 } else if (strcmp(errors, "replace")==0) {
4723 flags = 0;
4724 pusedDefaultChar = NULL;
4725 } else {
4726 PyErr_Format(PyExc_ValueError,
4727 "mbcs encoding does not support errors='%s'",
4728 errors);
4729 return -1;
4730 }
4731
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004732 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004733 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004734 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4735 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004736 if (mbcssize == 0) {
4737 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4738 return -1;
4739 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004740 /* If we used a default char, then we failed! */
4741 if (pusedDefaultChar && *pusedDefaultChar)
4742 goto mbcs_encode_error;
4743 } else {
4744 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004745 }
4746
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004747 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004748 /* Create string object */
4749 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4750 if (*repr == NULL)
4751 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004752 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004753 }
4754 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004755 /* Extend string object */
4756 n = PyBytes_Size(*repr);
4757 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4758 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004759 }
4760
4761 /* Do the conversion */
4762 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004763 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004764 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4765 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4767 return -1;
4768 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004769 if (pusedDefaultChar && *pusedDefaultChar)
4770 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004771 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004772 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004773
4774mbcs_encode_error:
4775 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4776 Py_XDECREF(exc);
4777 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004778}
4779
4780PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004781 Py_ssize_t size,
4782 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004783{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004784 PyObject *repr = NULL;
4785 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004786
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004787#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004789 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004790 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004791 else
4792#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004793 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004794
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004795 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004796 Py_XDECREF(repr);
4797 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004798 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004799
4800#ifdef NEED_RETRY
4801 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 p += INT_MAX;
4803 size -= INT_MAX;
4804 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004805 }
4806#endif
4807
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004808 return repr;
4809}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004810
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004811PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4812{
4813 if (!PyUnicode_Check(unicode)) {
4814 PyErr_BadArgument();
4815 return NULL;
4816 }
4817 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 PyUnicode_GET_SIZE(unicode),
4819 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004820}
4821
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004822#undef NEED_RETRY
4823
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004824#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004825
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826/* --- Character Mapping Codec -------------------------------------------- */
4827
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004829 Py_ssize_t size,
4830 PyObject *mapping,
4831 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004834 Py_ssize_t startinpos;
4835 Py_ssize_t endinpos;
4836 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 PyUnicodeObject *v;
4839 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004840 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841 PyObject *errorHandler = NULL;
4842 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004843 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004844 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 /* Default to Latin-1 */
4847 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849
4850 v = _PyUnicode_New(size);
4851 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004857 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 mapstring = PyUnicode_AS_UNICODE(mapping);
4859 maplen = PyUnicode_GET_SIZE(mapping);
4860 while (s < e) {
4861 unsigned char ch = *s;
4862 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 if (ch < maplen)
4865 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 if (x == 0xfffe) {
4868 /* undefined mapping */
4869 outpos = p-PyUnicode_AS_UNICODE(v);
4870 startinpos = s-starts;
4871 endinpos = startinpos+1;
4872 if (unicode_decode_call_errorhandler(
4873 errors, &errorHandler,
4874 "charmap", "character maps to <undefined>",
4875 &starts, &e, &startinpos, &endinpos, &exc, &s,
4876 &v, &outpos, &p)) {
4877 goto onError;
4878 }
4879 continue;
4880 }
4881 *p++ = x;
4882 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004883 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004884 }
4885 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004886 while (s < e) {
4887 unsigned char ch = *s;
4888 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004889
Benjamin Peterson29060642009-01-31 22:14:21 +00004890 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4891 w = PyLong_FromLong((long)ch);
4892 if (w == NULL)
4893 goto onError;
4894 x = PyObject_GetItem(mapping, w);
4895 Py_DECREF(w);
4896 if (x == NULL) {
4897 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4898 /* No mapping found means: mapping is undefined. */
4899 PyErr_Clear();
4900 x = Py_None;
4901 Py_INCREF(x);
4902 } else
4903 goto onError;
4904 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004905
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 /* Apply mapping */
4907 if (PyLong_Check(x)) {
4908 long value = PyLong_AS_LONG(x);
4909 if (value < 0 || value > 65535) {
4910 PyErr_SetString(PyExc_TypeError,
4911 "character mapping must be in range(65536)");
4912 Py_DECREF(x);
4913 goto onError;
4914 }
4915 *p++ = (Py_UNICODE)value;
4916 }
4917 else if (x == Py_None) {
4918 /* undefined mapping */
4919 outpos = p-PyUnicode_AS_UNICODE(v);
4920 startinpos = s-starts;
4921 endinpos = startinpos+1;
4922 if (unicode_decode_call_errorhandler(
4923 errors, &errorHandler,
4924 "charmap", "character maps to <undefined>",
4925 &starts, &e, &startinpos, &endinpos, &exc, &s,
4926 &v, &outpos, &p)) {
4927 Py_DECREF(x);
4928 goto onError;
4929 }
4930 Py_DECREF(x);
4931 continue;
4932 }
4933 else if (PyUnicode_Check(x)) {
4934 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004935
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 if (targetsize == 1)
4937 /* 1-1 mapping */
4938 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004939
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 else if (targetsize > 1) {
4941 /* 1-n mapping */
4942 if (targetsize > extrachars) {
4943 /* resize first */
4944 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4945 Py_ssize_t needed = (targetsize - extrachars) + \
4946 (targetsize << 2);
4947 extrachars += needed;
4948 /* XXX overflow detection missing */
4949 if (_PyUnicode_Resize(&v,
4950 PyUnicode_GET_SIZE(v) + needed) < 0) {
4951 Py_DECREF(x);
4952 goto onError;
4953 }
4954 p = PyUnicode_AS_UNICODE(v) + oldpos;
4955 }
4956 Py_UNICODE_COPY(p,
4957 PyUnicode_AS_UNICODE(x),
4958 targetsize);
4959 p += targetsize;
4960 extrachars -= targetsize;
4961 }
4962 /* 1-0 mapping: skip the character */
4963 }
4964 else {
4965 /* wrong return value */
4966 PyErr_SetString(PyExc_TypeError,
4967 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004968 Py_DECREF(x);
4969 goto onError;
4970 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 Py_DECREF(x);
4972 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 }
4975 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4977 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978 Py_XDECREF(errorHandler);
4979 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004980 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004981
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004983 Py_XDECREF(errorHandler);
4984 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 Py_XDECREF(v);
4986 return NULL;
4987}
4988
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004989/* Charmap encoding: the lookup table */
4990
4991struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 PyObject_HEAD
4993 unsigned char level1[32];
4994 int count2, count3;
4995 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004996};
4997
4998static PyObject*
4999encoding_map_size(PyObject *obj, PyObject* args)
5000{
5001 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005002 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005004}
5005
5006static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005007 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 PyDoc_STR("Return the size (in bytes) of this object") },
5009 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005010};
5011
5012static void
5013encoding_map_dealloc(PyObject* o)
5014{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005015 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005016}
5017
5018static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005019 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 "EncodingMap", /*tp_name*/
5021 sizeof(struct encoding_map), /*tp_basicsize*/
5022 0, /*tp_itemsize*/
5023 /* methods */
5024 encoding_map_dealloc, /*tp_dealloc*/
5025 0, /*tp_print*/
5026 0, /*tp_getattr*/
5027 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005028 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 0, /*tp_repr*/
5030 0, /*tp_as_number*/
5031 0, /*tp_as_sequence*/
5032 0, /*tp_as_mapping*/
5033 0, /*tp_hash*/
5034 0, /*tp_call*/
5035 0, /*tp_str*/
5036 0, /*tp_getattro*/
5037 0, /*tp_setattro*/
5038 0, /*tp_as_buffer*/
5039 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5040 0, /*tp_doc*/
5041 0, /*tp_traverse*/
5042 0, /*tp_clear*/
5043 0, /*tp_richcompare*/
5044 0, /*tp_weaklistoffset*/
5045 0, /*tp_iter*/
5046 0, /*tp_iternext*/
5047 encoding_map_methods, /*tp_methods*/
5048 0, /*tp_members*/
5049 0, /*tp_getset*/
5050 0, /*tp_base*/
5051 0, /*tp_dict*/
5052 0, /*tp_descr_get*/
5053 0, /*tp_descr_set*/
5054 0, /*tp_dictoffset*/
5055 0, /*tp_init*/
5056 0, /*tp_alloc*/
5057 0, /*tp_new*/
5058 0, /*tp_free*/
5059 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005060};
5061
5062PyObject*
5063PyUnicode_BuildEncodingMap(PyObject* string)
5064{
5065 Py_UNICODE *decode;
5066 PyObject *result;
5067 struct encoding_map *mresult;
5068 int i;
5069 int need_dict = 0;
5070 unsigned char level1[32];
5071 unsigned char level2[512];
5072 unsigned char *mlevel1, *mlevel2, *mlevel3;
5073 int count2 = 0, count3 = 0;
5074
5075 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5076 PyErr_BadArgument();
5077 return NULL;
5078 }
5079 decode = PyUnicode_AS_UNICODE(string);
5080 memset(level1, 0xFF, sizeof level1);
5081 memset(level2, 0xFF, sizeof level2);
5082
5083 /* If there isn't a one-to-one mapping of NULL to \0,
5084 or if there are non-BMP characters, we need to use
5085 a mapping dictionary. */
5086 if (decode[0] != 0)
5087 need_dict = 1;
5088 for (i = 1; i < 256; i++) {
5089 int l1, l2;
5090 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005091#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005092 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005093#endif
5094 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005095 need_dict = 1;
5096 break;
5097 }
5098 if (decode[i] == 0xFFFE)
5099 /* unmapped character */
5100 continue;
5101 l1 = decode[i] >> 11;
5102 l2 = decode[i] >> 7;
5103 if (level1[l1] == 0xFF)
5104 level1[l1] = count2++;
5105 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005106 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005107 }
5108
5109 if (count2 >= 0xFF || count3 >= 0xFF)
5110 need_dict = 1;
5111
5112 if (need_dict) {
5113 PyObject *result = PyDict_New();
5114 PyObject *key, *value;
5115 if (!result)
5116 return NULL;
5117 for (i = 0; i < 256; i++) {
5118 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005119 key = PyLong_FromLong(decode[i]);
5120 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005121 if (!key || !value)
5122 goto failed1;
5123 if (PyDict_SetItem(result, key, value) == -1)
5124 goto failed1;
5125 Py_DECREF(key);
5126 Py_DECREF(value);
5127 }
5128 return result;
5129 failed1:
5130 Py_XDECREF(key);
5131 Py_XDECREF(value);
5132 Py_DECREF(result);
5133 return NULL;
5134 }
5135
5136 /* Create a three-level trie */
5137 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5138 16*count2 + 128*count3 - 1);
5139 if (!result)
5140 return PyErr_NoMemory();
5141 PyObject_Init(result, &EncodingMapType);
5142 mresult = (struct encoding_map*)result;
5143 mresult->count2 = count2;
5144 mresult->count3 = count3;
5145 mlevel1 = mresult->level1;
5146 mlevel2 = mresult->level23;
5147 mlevel3 = mresult->level23 + 16*count2;
5148 memcpy(mlevel1, level1, 32);
5149 memset(mlevel2, 0xFF, 16*count2);
5150 memset(mlevel3, 0, 128*count3);
5151 count3 = 0;
5152 for (i = 1; i < 256; i++) {
5153 int o1, o2, o3, i2, i3;
5154 if (decode[i] == 0xFFFE)
5155 /* unmapped character */
5156 continue;
5157 o1 = decode[i]>>11;
5158 o2 = (decode[i]>>7) & 0xF;
5159 i2 = 16*mlevel1[o1] + o2;
5160 if (mlevel2[i2] == 0xFF)
5161 mlevel2[i2] = count3++;
5162 o3 = decode[i] & 0x7F;
5163 i3 = 128*mlevel2[i2] + o3;
5164 mlevel3[i3] = i;
5165 }
5166 return result;
5167}
5168
5169static int
5170encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5171{
5172 struct encoding_map *map = (struct encoding_map*)mapping;
5173 int l1 = c>>11;
5174 int l2 = (c>>7) & 0xF;
5175 int l3 = c & 0x7F;
5176 int i;
5177
5178#ifdef Py_UNICODE_WIDE
5179 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005181 }
5182#endif
5183 if (c == 0)
5184 return 0;
5185 /* level 1*/
5186 i = map->level1[l1];
5187 if (i == 0xFF) {
5188 return -1;
5189 }
5190 /* level 2*/
5191 i = map->level23[16*i+l2];
5192 if (i == 0xFF) {
5193 return -1;
5194 }
5195 /* level 3 */
5196 i = map->level23[16*map->count2 + 128*i + l3];
5197 if (i == 0) {
5198 return -1;
5199 }
5200 return i;
5201}
5202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005203/* Lookup the character ch in the mapping. If the character
5204 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005205 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005206static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207{
Christian Heimes217cfd12007-12-02 14:31:20 +00005208 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209 PyObject *x;
5210
5211 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 x = PyObject_GetItem(mapping, w);
5214 Py_DECREF(w);
5215 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5217 /* No mapping found means: mapping is undefined. */
5218 PyErr_Clear();
5219 x = Py_None;
5220 Py_INCREF(x);
5221 return x;
5222 } else
5223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005225 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005227 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 long value = PyLong_AS_LONG(x);
5229 if (value < 0 || value > 255) {
5230 PyErr_SetString(PyExc_TypeError,
5231 "character mapping must be in range(256)");
5232 Py_DECREF(x);
5233 return NULL;
5234 }
5235 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005237 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 /* wrong return value */
5241 PyErr_Format(PyExc_TypeError,
5242 "character mapping must return integer, bytes or None, not %.400s",
5243 x->ob_type->tp_name);
5244 Py_DECREF(x);
5245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 }
5247}
5248
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005249static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005250charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005251{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005252 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5253 /* exponentially overallocate to minimize reallocations */
5254 if (requiredsize < 2*outsize)
5255 requiredsize = 2*outsize;
5256 if (_PyBytes_Resize(outobj, requiredsize))
5257 return -1;
5258 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005259}
5260
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred; an exception has been set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005265 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005266 space is available. Return a new reference to the object that
5267 was put in the output buffer, or Py_None, if the mapping was undefined
5268 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005269 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005270static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005271charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005273{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005274 PyObject *rep;
5275 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005276 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005277
Christian Heimes90aa7642007-12-19 02:45:37 +00005278 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005279 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005280 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005281 if (res == -1)
5282 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 if (outsize<requiredsize)
5284 if (charmapencode_resize(outobj, outpos, requiredsize))
5285 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005286 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 outstart[(*outpos)++] = (char)res;
5288 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005289 }
5290
5291 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005292 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005294 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 Py_DECREF(rep);
5296 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005297 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 if (PyLong_Check(rep)) {
5299 Py_ssize_t requiredsize = *outpos+1;
5300 if (outsize<requiredsize)
5301 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5302 Py_DECREF(rep);
5303 return enc_EXCEPTION;
5304 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005305 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005307 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 else {
5309 const char *repchars = PyBytes_AS_STRING(rep);
5310 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5311 Py_ssize_t requiredsize = *outpos+repsize;
5312 if (outsize<requiredsize)
5313 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5314 Py_DECREF(rep);
5315 return enc_EXCEPTION;
5316 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005317 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 memcpy(outstart + *outpos, repchars, repsize);
5319 *outpos += repsize;
5320 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005321 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005322 Py_DECREF(rep);
5323 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324}
5325
5326/* handle an error in PyUnicode_EncodeCharmap
5327 Return 0 on success, -1 on error */
5328static
5329int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005330 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005331 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005332 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005333 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005334{
5335 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336 Py_ssize_t repsize;
5337 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338 Py_UNICODE *uni2;
5339 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005340 Py_ssize_t collstartpos = *inpos;
5341 Py_ssize_t collendpos = *inpos+1;
5342 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005343 char *encoding = "charmap";
5344 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005345 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 /* find all unencodable characters */
5348 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005349 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005350 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 int res = encoding_map_lookup(p[collendpos], mapping);
5352 if (res != -1)
5353 break;
5354 ++collendpos;
5355 continue;
5356 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005357
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 rep = charmapencode_lookup(p[collendpos], mapping);
5359 if (rep==NULL)
5360 return -1;
5361 else if (rep!=Py_None) {
5362 Py_DECREF(rep);
5363 break;
5364 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005365 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 }
5368 /* cache callback name lookup
5369 * (if not done yet, i.e. it's the first error) */
5370 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 if ((errors==NULL) || (!strcmp(errors, "strict")))
5372 *known_errorHandler = 1;
5373 else if (!strcmp(errors, "replace"))
5374 *known_errorHandler = 2;
5375 else if (!strcmp(errors, "ignore"))
5376 *known_errorHandler = 3;
5377 else if (!strcmp(errors, "xmlcharrefreplace"))
5378 *known_errorHandler = 4;
5379 else
5380 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005381 }
5382 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005383 case 1: /* strict */
5384 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5385 return -1;
5386 case 2: /* replace */
5387 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 x = charmapencode_output('?', mapping, res, respos);
5389 if (x==enc_EXCEPTION) {
5390 return -1;
5391 }
5392 else if (x==enc_FAILED) {
5393 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5394 return -1;
5395 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005396 }
5397 /* fall through */
5398 case 3: /* ignore */
5399 *inpos = collendpos;
5400 break;
5401 case 4: /* xmlcharrefreplace */
5402 /* generate replacement (temporarily (mis)uses p) */
5403 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 char buffer[2+29+1+1];
5405 char *cp;
5406 sprintf(buffer, "&#%d;", (int)p[collpos]);
5407 for (cp = buffer; *cp; ++cp) {
5408 x = charmapencode_output(*cp, mapping, res, respos);
5409 if (x==enc_EXCEPTION)
5410 return -1;
5411 else if (x==enc_FAILED) {
5412 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5413 return -1;
5414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005415 }
5416 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005417 *inpos = collendpos;
5418 break;
5419 default:
5420 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 encoding, reason, p, size, exceptionObject,
5422 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005423 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005425 if (PyBytes_Check(repunicode)) {
5426 /* Directly copy bytes result to output. */
5427 Py_ssize_t outsize = PyBytes_Size(*res);
5428 Py_ssize_t requiredsize;
5429 repsize = PyBytes_Size(repunicode);
5430 requiredsize = *respos + repsize;
5431 if (requiredsize > outsize)
5432 /* Make room for all additional bytes. */
5433 if (charmapencode_resize(res, respos, requiredsize)) {
5434 Py_DECREF(repunicode);
5435 return -1;
5436 }
5437 memcpy(PyBytes_AsString(*res) + *respos,
5438 PyBytes_AsString(repunicode), repsize);
5439 *respos += repsize;
5440 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005441 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005442 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005443 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005444 /* generate replacement */
5445 repsize = PyUnicode_GET_SIZE(repunicode);
5446 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 x = charmapencode_output(*uni2, mapping, res, respos);
5448 if (x==enc_EXCEPTION) {
5449 return -1;
5450 }
5451 else if (x==enc_FAILED) {
5452 Py_DECREF(repunicode);
5453 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5454 return -1;
5455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005456 }
5457 *inpos = newpos;
5458 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 }
5460 return 0;
5461}
5462
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 Py_ssize_t size,
5465 PyObject *mapping,
5466 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468 /* output object */
5469 PyObject *res = NULL;
5470 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005471 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005473 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005474 PyObject *errorHandler = NULL;
5475 PyObject *exc = NULL;
5476 /* the following variable is used for caching string comparisons
5477 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5478 * 3=ignore, 4=xmlcharrefreplace */
5479 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480
5481 /* Default to Latin-1 */
5482 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005485 /* allocate enough for a simple encoding without
5486 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005487 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 if (res == NULL)
5489 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005490 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 /* try to encode it */
5495 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5496 if (x==enc_EXCEPTION) /* error */
5497 goto onError;
5498 if (x==enc_FAILED) { /* unencodable character */
5499 if (charmap_encoding_error(p, size, &inpos, mapping,
5500 &exc,
5501 &known_errorHandler, &errorHandler, errors,
5502 &res, &respos)) {
5503 goto onError;
5504 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005505 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 else
5507 /* done with this character => adjust input position */
5508 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005512 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005513 if (_PyBytes_Resize(&res, respos) < 0)
5514 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005515
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 Py_XDECREF(exc);
5517 Py_XDECREF(errorHandler);
5518 return res;
5519
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005521 Py_XDECREF(res);
5522 Py_XDECREF(exc);
5523 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 return NULL;
5525}
5526
5527PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529{
5530 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 PyErr_BadArgument();
5532 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 }
5534 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005535 PyUnicode_GET_SIZE(unicode),
5536 mapping,
5537 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538}
5539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005540/* create or adjust a UnicodeTranslateError */
5541static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 const Py_UNICODE *unicode, Py_ssize_t size,
5543 Py_ssize_t startpos, Py_ssize_t endpos,
5544 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005546 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005547 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 }
5550 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5552 goto onError;
5553 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5554 goto onError;
5555 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5556 goto onError;
5557 return;
5558 onError:
5559 Py_DECREF(*exceptionObject);
5560 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 }
5562}
5563
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564/* raises a UnicodeTranslateError */
5565static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005566 const Py_UNICODE *unicode, Py_ssize_t size,
5567 Py_ssize_t startpos, Py_ssize_t endpos,
5568 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569{
5570 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574}
5575
5576/* error handling callback helper:
5577 build arguments, call the callback and check the arguments,
5578 put the result into newpos and return the replacement string, which
5579 has to be freed by the caller */
5580static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 PyObject **errorHandler,
5582 const char *reason,
5583 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5584 Py_ssize_t startpos, Py_ssize_t endpos,
5585 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005586{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005587 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005588
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005589 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590 PyObject *restuple;
5591 PyObject *resunicode;
5592
5593 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005595 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597 }
5598
5599 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005601 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005602 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603
5604 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005608 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005609 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 Py_DECREF(restuple);
5611 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005612 }
5613 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005614 &resunicode, &i_newpos)) {
5615 Py_DECREF(restuple);
5616 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005618 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005620 else
5621 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005622 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5624 Py_DECREF(restuple);
5625 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005626 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627 Py_INCREF(resunicode);
5628 Py_DECREF(restuple);
5629 return resunicode;
5630}
5631
5632/* Lookup the character ch in the mapping and put the result in result,
5633 which must be decrefed by the caller.
5634 Return 0 on success, -1 on error */
5635static
5636int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5637{
Christian Heimes217cfd12007-12-02 14:31:20 +00005638 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 PyObject *x;
5640
5641 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 x = PyObject_GetItem(mapping, w);
5644 Py_DECREF(w);
5645 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5647 /* No mapping found means: use 1:1 mapping. */
5648 PyErr_Clear();
5649 *result = NULL;
5650 return 0;
5651 } else
5652 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 }
5654 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 *result = x;
5656 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005657 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005658 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 long value = PyLong_AS_LONG(x);
5660 long max = PyUnicode_GetMax();
5661 if (value < 0 || value > max) {
5662 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005663 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 Py_DECREF(x);
5665 return -1;
5666 }
5667 *result = x;
5668 return 0;
5669 }
5670 else if (PyUnicode_Check(x)) {
5671 *result = x;
5672 return 0;
5673 }
5674 else {
5675 /* wrong return value */
5676 PyErr_SetString(PyExc_TypeError,
5677 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005678 Py_DECREF(x);
5679 return -1;
5680 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005681}
5682/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 if not reallocate and adjust various state variables.
5684 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685static
Walter Dörwald4894c302003-10-24 14:25:28 +00005686int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005689 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005690 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 /* remember old output position */
5692 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5693 /* exponentially overallocate to minimize reallocations */
5694 if (requiredsize < 2 * oldsize)
5695 requiredsize = 2 * oldsize;
5696 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5697 return -1;
5698 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 }
5700 return 0;
5701}
5702/* lookup the character, put the result in the output string and adjust
5703 various state variables. Return a new reference to the object that
5704 was put in the output buffer in *result, or Py_None, if the mapping was
5705 undefined (in which case no character was written).
5706 The called must decref result.
5707 Return 0 on success, -1 on error. */
5708static
Walter Dörwald4894c302003-10-24 14:25:28 +00005709int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5711 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712{
Walter Dörwald4894c302003-10-24 14:25:28 +00005713 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 /* not found => default to 1:1 mapping */
5717 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718 }
5719 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005721 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 /* no overflow check, because we know that the space is enough */
5723 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 }
5725 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5727 if (repsize==1) {
5728 /* no overflow check, because we know that the space is enough */
5729 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5730 }
5731 else if (repsize!=0) {
5732 /* more than one character */
5733 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5734 (insize - (curinp-startinp)) +
5735 repsize - 1;
5736 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5737 return -1;
5738 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5739 *outp += repsize;
5740 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 }
5742 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 return 0;
5745}
5746
5747PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 Py_ssize_t size,
5749 PyObject *mapping,
5750 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 /* output object */
5753 PyObject *res = NULL;
5754 /* pointers to the beginning and end+1 of input */
5755 const Py_UNICODE *startp = p;
5756 const Py_UNICODE *endp = p + size;
5757 /* pointer into the output */
5758 Py_UNICODE *str;
5759 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005760 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 char *reason = "character maps to <undefined>";
5762 PyObject *errorHandler = NULL;
5763 PyObject *exc = NULL;
5764 /* the following variable is used for caching string comparisons
5765 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5766 * 3=ignore, 4=xmlcharrefreplace */
5767 int known_errorHandler = -1;
5768
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 PyErr_BadArgument();
5771 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773
5774 /* allocate enough for a simple 1:1 translation without
5775 replacements, if we need more, we'll resize */
5776 res = PyUnicode_FromUnicode(NULL, size);
5777 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 /* try to encode it */
5785 PyObject *x = NULL;
5786 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5787 Py_XDECREF(x);
5788 goto onError;
5789 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005790 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 if (x!=Py_None) /* it worked => adjust input pointer */
5792 ++p;
5793 else { /* untranslatable character */
5794 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5795 Py_ssize_t repsize;
5796 Py_ssize_t newpos;
5797 Py_UNICODE *uni2;
5798 /* startpos for collecting untranslatable chars */
5799 const Py_UNICODE *collstart = p;
5800 const Py_UNICODE *collend = p+1;
5801 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 /* find all untranslatable characters */
5804 while (collend < endp) {
5805 if (charmaptranslate_lookup(*collend, mapping, &x))
5806 goto onError;
5807 Py_XDECREF(x);
5808 if (x!=Py_None)
5809 break;
5810 ++collend;
5811 }
5812 /* cache callback name lookup
5813 * (if not done yet, i.e. it's the first error) */
5814 if (known_errorHandler==-1) {
5815 if ((errors==NULL) || (!strcmp(errors, "strict")))
5816 known_errorHandler = 1;
5817 else if (!strcmp(errors, "replace"))
5818 known_errorHandler = 2;
5819 else if (!strcmp(errors, "ignore"))
5820 known_errorHandler = 3;
5821 else if (!strcmp(errors, "xmlcharrefreplace"))
5822 known_errorHandler = 4;
5823 else
5824 known_errorHandler = 0;
5825 }
5826 switch (known_errorHandler) {
5827 case 1: /* strict */
5828 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005829 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 case 2: /* replace */
5831 /* No need to check for space, this is a 1:1 replacement */
5832 for (coll = collstart; coll<collend; ++coll)
5833 *str++ = '?';
5834 /* fall through */
5835 case 3: /* ignore */
5836 p = collend;
5837 break;
5838 case 4: /* xmlcharrefreplace */
5839 /* generate replacement (temporarily (mis)uses p) */
5840 for (p = collstart; p < collend; ++p) {
5841 char buffer[2+29+1+1];
5842 char *cp;
5843 sprintf(buffer, "&#%d;", (int)*p);
5844 if (charmaptranslate_makespace(&res, &str,
5845 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5846 goto onError;
5847 for (cp = buffer; *cp; ++cp)
5848 *str++ = *cp;
5849 }
5850 p = collend;
5851 break;
5852 default:
5853 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5854 reason, startp, size, &exc,
5855 collstart-startp, collend-startp, &newpos);
5856 if (repunicode == NULL)
5857 goto onError;
5858 /* generate replacement */
5859 repsize = PyUnicode_GET_SIZE(repunicode);
5860 if (charmaptranslate_makespace(&res, &str,
5861 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5862 Py_DECREF(repunicode);
5863 goto onError;
5864 }
5865 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5866 *str++ = *uni2;
5867 p = startp + newpos;
5868 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005869 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005870 }
5871 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005872 /* Resize if we allocated to much */
5873 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005874 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 if (PyUnicode_Resize(&res, respos) < 0)
5876 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877 }
5878 Py_XDECREF(exc);
5879 Py_XDECREF(errorHandler);
5880 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 Py_XDECREF(res);
5884 Py_XDECREF(exc);
5885 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 return NULL;
5887}
5888
5889PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005890 PyObject *mapping,
5891 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892{
5893 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005894
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 str = PyUnicode_FromObject(str);
5896 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 PyUnicode_GET_SIZE(str),
5900 mapping,
5901 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 Py_DECREF(str);
5903 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005904
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 Py_XDECREF(str);
5907 return NULL;
5908}
Tim Petersced69f82003-09-16 20:30:58 +00005909
Guido van Rossum9e896b32000-04-05 20:11:21 +00005910/* --- Decimal Encoder ---------------------------------------------------- */
5911
5912int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 Py_ssize_t length,
5914 char *output,
5915 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005916{
5917 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918 PyObject *errorHandler = NULL;
5919 PyObject *exc = NULL;
5920 const char *encoding = "decimal";
5921 const char *reason = "invalid decimal Unicode string";
5922 /* the following variable is used for caching string comparisons
5923 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5924 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005925
5926 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 PyErr_BadArgument();
5928 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005929 }
5930
5931 p = s;
5932 end = s + length;
5933 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 register Py_UNICODE ch = *p;
5935 int decimal;
5936 PyObject *repunicode;
5937 Py_ssize_t repsize;
5938 Py_ssize_t newpos;
5939 Py_UNICODE *uni2;
5940 Py_UNICODE *collstart;
5941 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005942
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005944 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 ++p;
5946 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005947 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 decimal = Py_UNICODE_TODECIMAL(ch);
5949 if (decimal >= 0) {
5950 *output++ = '0' + decimal;
5951 ++p;
5952 continue;
5953 }
5954 if (0 < ch && ch < 256) {
5955 *output++ = (char)ch;
5956 ++p;
5957 continue;
5958 }
5959 /* All other characters are considered unencodable */
5960 collstart = p;
5961 collend = p+1;
5962 while (collend < end) {
5963 if ((0 < *collend && *collend < 256) ||
5964 !Py_UNICODE_ISSPACE(*collend) ||
5965 Py_UNICODE_TODECIMAL(*collend))
5966 break;
5967 }
5968 /* cache callback name lookup
5969 * (if not done yet, i.e. it's the first error) */
5970 if (known_errorHandler==-1) {
5971 if ((errors==NULL) || (!strcmp(errors, "strict")))
5972 known_errorHandler = 1;
5973 else if (!strcmp(errors, "replace"))
5974 known_errorHandler = 2;
5975 else if (!strcmp(errors, "ignore"))
5976 known_errorHandler = 3;
5977 else if (!strcmp(errors, "xmlcharrefreplace"))
5978 known_errorHandler = 4;
5979 else
5980 known_errorHandler = 0;
5981 }
5982 switch (known_errorHandler) {
5983 case 1: /* strict */
5984 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5985 goto onError;
5986 case 2: /* replace */
5987 for (p = collstart; p < collend; ++p)
5988 *output++ = '?';
5989 /* fall through */
5990 case 3: /* ignore */
5991 p = collend;
5992 break;
5993 case 4: /* xmlcharrefreplace */
5994 /* generate replacement (temporarily (mis)uses p) */
5995 for (p = collstart; p < collend; ++p)
5996 output += sprintf(output, "&#%d;", (int)*p);
5997 p = collend;
5998 break;
5999 default:
6000 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6001 encoding, reason, s, length, &exc,
6002 collstart-s, collend-s, &newpos);
6003 if (repunicode == NULL)
6004 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006005 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006006 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006007 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6008 Py_DECREF(repunicode);
6009 goto onError;
6010 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 /* generate replacement */
6012 repsize = PyUnicode_GET_SIZE(repunicode);
6013 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6014 Py_UNICODE ch = *uni2;
6015 if (Py_UNICODE_ISSPACE(ch))
6016 *output++ = ' ';
6017 else {
6018 decimal = Py_UNICODE_TODECIMAL(ch);
6019 if (decimal >= 0)
6020 *output++ = '0' + decimal;
6021 else if (0 < ch && ch < 256)
6022 *output++ = (char)ch;
6023 else {
6024 Py_DECREF(repunicode);
6025 raise_encode_exception(&exc, encoding,
6026 s, length, collstart-s, collend-s, reason);
6027 goto onError;
6028 }
6029 }
6030 }
6031 p = s + newpos;
6032 Py_DECREF(repunicode);
6033 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006034 }
6035 /* 0-terminate the output string */
6036 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 Py_XDECREF(exc);
6038 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006039 return 0;
6040
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 Py_XDECREF(exc);
6043 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006044 return -1;
6045}
6046
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047/* --- Helpers ------------------------------------------------------------ */
6048
Eric Smith8c663262007-08-25 02:26:07 +00006049#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006050#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006051
Thomas Wouters477c8d52006-05-27 19:21:47 +00006052#include "stringlib/count.h"
6053#include "stringlib/find.h"
6054#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006055#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006056
Eric Smith5807c412008-05-11 21:00:57 +00006057#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006058#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006059#include "stringlib/localeutil.h"
6060
/* helper macro to fixup start/end slice values:
   `end` is clipped to `len`; a negative `end` or `start` has `len`
   added to it and is then clamped at 0.  `start` and `end` must be
   modifiable lvalues; they are updated in place.

   The body is wrapped in do { } while (0) so the macro behaves as a
   single statement (safe in unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006075
Martin v. Löwis18e16552006-02-15 17:27:45 +00006076Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006077 PyObject *substr,
6078 Py_ssize_t start,
6079 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006081 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006082 PyUnicodeObject* str_obj;
6083 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006084
Thomas Wouters477c8d52006-05-27 19:21:47 +00006085 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6086 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006088 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6089 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 Py_DECREF(str_obj);
6091 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 }
Tim Petersced69f82003-09-16 20:30:58 +00006093
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006094 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006095 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006096 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6097 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006098 );
6099
6100 Py_DECREF(sub_obj);
6101 Py_DECREF(str_obj);
6102
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 return result;
6104}
6105
Martin v. Löwis18e16552006-02-15 17:27:45 +00006106Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006107 PyObject *sub,
6108 Py_ssize_t start,
6109 Py_ssize_t end,
6110 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006112 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006113
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006115 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006117 sub = PyUnicode_FromObject(sub);
6118 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 Py_DECREF(str);
6120 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 }
Tim Petersced69f82003-09-16 20:30:58 +00006122
Thomas Wouters477c8d52006-05-27 19:21:47 +00006123 if (direction > 0)
6124 result = stringlib_find_slice(
6125 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6126 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6127 start, end
6128 );
6129 else
6130 result = stringlib_rfind_slice(
6131 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6132 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6133 start, end
6134 );
6135
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006137 Py_DECREF(sub);
6138
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 return result;
6140}
6141
Tim Petersced69f82003-09-16 20:30:58 +00006142static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 PyUnicodeObject *substring,
6145 Py_ssize_t start,
6146 Py_ssize_t end,
6147 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 if (substring->length == 0)
6150 return 1;
6151
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006152 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 end -= substring->length;
6154 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
6157 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 if (Py_UNICODE_MATCH(self, end, substring))
6159 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 } else {
6161 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 }
6164
6165 return 0;
6166}
6167
Martin v. Löwis18e16552006-02-15 17:27:45 +00006168Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 PyObject *substr,
6170 Py_ssize_t start,
6171 Py_ssize_t end,
6172 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006174 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006175
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 str = PyUnicode_FromObject(str);
6177 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 substr = PyUnicode_FromObject(substr);
6180 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 Py_DECREF(str);
6182 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 }
Tim Petersced69f82003-09-16 20:30:58 +00006184
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 (PyUnicodeObject *)substr,
6187 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 Py_DECREF(str);
6189 Py_DECREF(substr);
6190 return result;
6191}
6192
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193/* Apply fixfct filter to the Unicode object self and return a
6194 reference to the modified object */
6195
Tim Petersced69f82003-09-16 20:30:58 +00006196static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199{
6200
6201 PyUnicodeObject *u;
6202
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006203 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006206
6207 Py_UNICODE_COPY(u->str, self->str, self->length);
6208
Tim Peters7a29bd52001-09-12 03:03:31 +00006209 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 /* fixfct should return TRUE if it modified the buffer. If
6211 FALSE, return a reference to the original buffer instead
6212 (to save space, not time) */
6213 Py_INCREF(self);
6214 Py_DECREF(u);
6215 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 }
6217 return (PyObject*) u;
6218}
6219
Tim Petersced69f82003-09-16 20:30:58 +00006220static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221int fixupper(PyUnicodeObject *self)
6222{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006223 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 Py_UNICODE *s = self->str;
6225 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006226
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006229
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 ch = Py_UNICODE_TOUPPER(*s);
6231 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 *s = ch;
6234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 s++;
6236 }
6237
6238 return status;
6239}
6240
Tim Petersced69f82003-09-16 20:30:58 +00006241static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242int fixlower(PyUnicodeObject *self)
6243{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006244 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 Py_UNICODE *s = self->str;
6246 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006247
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006250
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 ch = Py_UNICODE_TOLOWER(*s);
6252 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 *s = ch;
6255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 s++;
6257 }
6258
6259 return status;
6260}
6261
Tim Petersced69f82003-09-16 20:30:58 +00006262static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263int fixswapcase(PyUnicodeObject *self)
6264{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006265 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266 Py_UNICODE *s = self->str;
6267 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006268
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 while (len-- > 0) {
6270 if (Py_UNICODE_ISUPPER(*s)) {
6271 *s = Py_UNICODE_TOLOWER(*s);
6272 status = 1;
6273 } else if (Py_UNICODE_ISLOWER(*s)) {
6274 *s = Py_UNICODE_TOUPPER(*s);
6275 status = 1;
6276 }
6277 s++;
6278 }
6279
6280 return status;
6281}
6282
Tim Petersced69f82003-09-16 20:30:58 +00006283static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284int fixcapitalize(PyUnicodeObject *self)
6285{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006286 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006287 Py_UNICODE *s = self->str;
6288 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006289
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006290 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006292 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 *s = Py_UNICODE_TOUPPER(*s);
6294 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006296 s++;
6297 while (--len > 0) {
6298 if (Py_UNICODE_ISUPPER(*s)) {
6299 *s = Py_UNICODE_TOLOWER(*s);
6300 status = 1;
6301 }
6302 s++;
6303 }
6304 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305}
6306
6307static
6308int fixtitle(PyUnicodeObject *self)
6309{
6310 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6311 register Py_UNICODE *e;
6312 int previous_is_cased;
6313
6314 /* Shortcut for single character strings */
6315 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6317 if (*p != ch) {
6318 *p = ch;
6319 return 1;
6320 }
6321 else
6322 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 }
Tim Petersced69f82003-09-16 20:30:58 +00006324
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325 e = p + PyUnicode_GET_SIZE(self);
6326 previous_is_cased = 0;
6327 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006329
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 if (previous_is_cased)
6331 *p = Py_UNICODE_TOLOWER(ch);
6332 else
6333 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006334
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 if (Py_UNICODE_ISLOWER(ch) ||
6336 Py_UNICODE_ISUPPER(ch) ||
6337 Py_UNICODE_ISTITLE(ch))
6338 previous_is_cased = 1;
6339 else
6340 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 }
6342 return 1;
6343}
6344
Tim Peters8ce9f162004-08-27 01:49:32 +00006345PyObject *
6346PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347{
Skip Montanaro6543b452004-09-16 03:28:13 +00006348 const Py_UNICODE blank = ' ';
6349 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006350 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006351 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006352 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6353 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006354 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6355 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006356 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006357 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358
Tim Peters05eba1f2004-08-27 21:32:02 +00006359 fseq = PySequence_Fast(seq, "");
6360 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006361 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006362 }
6363
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006364 /* NOTE: the following code can't call back into Python code,
6365 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006366 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006367
Tim Peters05eba1f2004-08-27 21:32:02 +00006368 seqlen = PySequence_Fast_GET_SIZE(fseq);
6369 /* If empty sequence, return u"". */
6370 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006371 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6372 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006373 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006374 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006375 /* If singleton sequence with an exact Unicode, return that. */
6376 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 item = items[0];
6378 if (PyUnicode_CheckExact(item)) {
6379 Py_INCREF(item);
6380 res = (PyUnicodeObject *)item;
6381 goto Done;
6382 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006383 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006384 else {
6385 /* Set up sep and seplen */
6386 if (separator == NULL) {
6387 sep = &blank;
6388 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006389 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006390 else {
6391 if (!PyUnicode_Check(separator)) {
6392 PyErr_Format(PyExc_TypeError,
6393 "separator: expected str instance,"
6394 " %.80s found",
6395 Py_TYPE(separator)->tp_name);
6396 goto onError;
6397 }
6398 sep = PyUnicode_AS_UNICODE(separator);
6399 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006400 }
6401 }
6402
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006403 /* There are at least two things to join, or else we have a subclass
6404 * of str in the sequence.
6405 * Do a pre-pass to figure out the total amount of space we'll
6406 * need (sz), and see whether all argument are strings.
6407 */
6408 sz = 0;
6409 for (i = 0; i < seqlen; i++) {
6410 const Py_ssize_t old_sz = sz;
6411 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 if (!PyUnicode_Check(item)) {
6413 PyErr_Format(PyExc_TypeError,
6414 "sequence item %zd: expected str instance,"
6415 " %.80s found",
6416 i, Py_TYPE(item)->tp_name);
6417 goto onError;
6418 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006419 sz += PyUnicode_GET_SIZE(item);
6420 if (i != 0)
6421 sz += seplen;
6422 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6423 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006425 goto onError;
6426 }
6427 }
Tim Petersced69f82003-09-16 20:30:58 +00006428
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006429 res = _PyUnicode_New(sz);
6430 if (res == NULL)
6431 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006432
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006433 /* Catenate everything. */
6434 res_p = PyUnicode_AS_UNICODE(res);
6435 for (i = 0; i < seqlen; ++i) {
6436 Py_ssize_t itemlen;
6437 item = items[i];
6438 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 /* Copy item, and maybe the separator. */
6440 if (i) {
6441 Py_UNICODE_COPY(res_p, sep, seplen);
6442 res_p += seplen;
6443 }
6444 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6445 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006446 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006447
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006449 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 return (PyObject *)res;
6451
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006453 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006454 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 return NULL;
6456}
6457
Tim Petersced69f82003-09-16 20:30:58 +00006458static
6459PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 Py_ssize_t left,
6461 Py_ssize_t right,
6462 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463{
6464 PyUnicodeObject *u;
6465
6466 if (left < 0)
6467 left = 0;
6468 if (right < 0)
6469 right = 0;
6470
Tim Peters7a29bd52001-09-12 03:03:31 +00006471 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 Py_INCREF(self);
6473 return self;
6474 }
6475
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006476 if (left > PY_SSIZE_T_MAX - self->length ||
6477 right > PY_SSIZE_T_MAX - (left + self->length)) {
6478 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6479 return NULL;
6480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 u = _PyUnicode_New(left + self->length + right);
6482 if (u) {
6483 if (left)
6484 Py_UNICODE_FILL(u->str, fill, left);
6485 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6486 if (right)
6487 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6488 }
6489
6490 return u;
6491}
6492
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006493PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496
6497 string = PyUnicode_FromObject(string);
6498 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006501 list = stringlib_splitlines(
6502 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6503 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504
6505 Py_DECREF(string);
6506 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507}
6508
Tim Petersced69f82003-09-16 20:30:58 +00006509static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 PyUnicodeObject *substring,
6512 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006515 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006518 return stringlib_split_whitespace(
6519 (PyObject*) self, self->str, self->length, maxcount
6520 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006522 return stringlib_split(
6523 (PyObject*) self, self->str, self->length,
6524 substring->str, substring->length,
6525 maxcount
6526 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527}
6528
Tim Petersced69f82003-09-16 20:30:58 +00006529static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006530PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 PyUnicodeObject *substring,
6532 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006533{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006534 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006535 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006536
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006537 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006538 return stringlib_rsplit_whitespace(
6539 (PyObject*) self, self->str, self->length, maxcount
6540 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006541
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006542 return stringlib_rsplit(
6543 (PyObject*) self, self->str, self->length,
6544 substring->str, substring->length,
6545 maxcount
6546 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006547}
6548
6549static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 PyUnicodeObject *str1,
6552 PyUnicodeObject *str2,
6553 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554{
6555 PyUnicodeObject *u;
6556
6557 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006559 else if (maxcount == 0 || self->length == 0)
6560 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561
Thomas Wouters477c8d52006-05-27 19:21:47 +00006562 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006563 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006564 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006565 if (str1->length == 0)
6566 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006567 if (str1->length == 1) {
6568 /* replace characters */
6569 Py_UNICODE u1, u2;
6570 if (!findchar(self->str, self->length, str1->str[0]))
6571 goto nothing;
6572 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6573 if (!u)
6574 return NULL;
6575 Py_UNICODE_COPY(u->str, self->str, self->length);
6576 u1 = str1->str[0];
6577 u2 = str2->str[0];
6578 for (i = 0; i < u->length; i++)
6579 if (u->str[i] == u1) {
6580 if (--maxcount < 0)
6581 break;
6582 u->str[i] = u2;
6583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006585 i = stringlib_find(
6586 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006588 if (i < 0)
6589 goto nothing;
6590 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6591 if (!u)
6592 return NULL;
6593 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006594
6595 /* change everything in-place, starting with this one */
6596 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6597 i += str1->length;
6598
6599 while ( --maxcount > 0) {
6600 i = stringlib_find(self->str+i, self->length-i,
6601 str1->str, str1->length,
6602 i);
6603 if (i == -1)
6604 break;
6605 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6606 i += str1->length;
6607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006610
6611 Py_ssize_t n, i, j, e;
6612 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 Py_UNICODE *p;
6614
6615 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006616 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6617 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006618 if (n == 0)
6619 goto nothing;
6620 /* new_size = self->length + n * (str2->length - str1->length)); */
6621 delta = (str2->length - str1->length);
6622 if (delta == 0) {
6623 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006625 product = n * (str2->length - str1->length);
6626 if ((product / (str2->length - str1->length)) != n) {
6627 PyErr_SetString(PyExc_OverflowError,
6628 "replace string is too long");
6629 return NULL;
6630 }
6631 new_size = self->length + product;
6632 if (new_size < 0) {
6633 PyErr_SetString(PyExc_OverflowError,
6634 "replace string is too long");
6635 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 }
6637 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006638 u = _PyUnicode_New(new_size);
6639 if (!u)
6640 return NULL;
6641 i = 0;
6642 p = u->str;
6643 e = self->length - str1->length;
6644 if (str1->length > 0) {
6645 while (n-- > 0) {
6646 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006647 j = stringlib_find(self->str+i, self->length-i,
6648 str1->str, str1->length,
6649 i);
6650 if (j == -1)
6651 break;
6652 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006653 /* copy unchanged part [i:j] */
6654 Py_UNICODE_COPY(p, self->str+i, j-i);
6655 p += j - i;
6656 }
6657 /* copy substitution string */
6658 if (str2->length > 0) {
6659 Py_UNICODE_COPY(p, str2->str, str2->length);
6660 p += str2->length;
6661 }
6662 i = j + str1->length;
6663 }
6664 if (i < self->length)
6665 /* copy tail [i:] */
6666 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6667 } else {
6668 /* interleave */
6669 while (n > 0) {
6670 Py_UNICODE_COPY(p, str2->str, str2->length);
6671 p += str2->length;
6672 if (--n <= 0)
6673 break;
6674 *p++ = self->str[i++];
6675 }
6676 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6677 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006680
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006682 /* nothing to replace; return original string (when possible) */
6683 if (PyUnicode_CheckExact(self)) {
6684 Py_INCREF(self);
6685 return (PyObject *) self;
6686 }
6687 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688}
6689
6690/* --- Unicode Object Methods --------------------------------------------- */
6691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006692PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694\n\
6695Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006696characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697
6698static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006699unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 return fixup(self, fixtitle);
6702}
6703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006704PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706\n\
6707Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006708have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709
6710static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006711unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 return fixup(self, fixcapitalize);
6714}
6715
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split the string into a
   list of words, replace every list entry by its fixcapitalize'd
   version, then join the list back together.  Returns NULL if the
   split, any capitalization or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* result is NULL here when a capitalization or the join failed */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6753
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006754/* Argument converter. Coerces to a single unicode character */
6755
6756static int
6757convert_uc(PyObject *obj, void *addr)
6758{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006759 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6760 PyObject *uniobj;
6761 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006762
Benjamin Peterson14339b62009-01-31 16:36:08 +00006763 uniobj = PyUnicode_FromObject(obj);
6764 if (uniobj == NULL) {
6765 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006767 return 0;
6768 }
6769 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6770 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006772 Py_DECREF(uniobj);
6773 return 0;
6774 }
6775 unistr = PyUnicode_AS_UNICODE(uniobj);
6776 *fillcharloc = unistr[0];
6777 Py_DECREF(uniobj);
6778 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006779}
6780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006781PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006784Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006785done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786
6787static PyObject *
6788unicode_center(PyUnicodeObject *self, PyObject *args)
6789{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006790 Py_ssize_t marg, left;
6791 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006792 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
Thomas Woutersde017742006-02-16 19:34:37 +00006794 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 return NULL;
6796
Tim Peters7a29bd52001-09-12 03:03:31 +00006797 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 Py_INCREF(self);
6799 return (PyObject*) self;
6800 }
6801
6802 marg = width - self->length;
6803 left = marg / 2 + (marg & width & 1);
6804
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006805 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806}
6807
Marc-André Lemburge5034372000-08-08 08:04:29 +00006808#if 0
6809
6810/* This code should go into some future Unicode collation support
6811 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006812 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006813
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006814/* speedy UTF-16 code point order comparison */
6815/* gleaned from: */
6816/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6817
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006818static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006819{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006820 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006821 0, 0, 0, 0, 0, 0, 0, 0,
6822 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006823 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006824};
6825
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826static int
6827unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6828{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006829 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006830
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 Py_UNICODE *s1 = str1->str;
6832 Py_UNICODE *s2 = str2->str;
6833
6834 len1 = str1->length;
6835 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006836
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006838 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006839
6840 c1 = *s1++;
6841 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006842
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 if (c1 > (1<<11) * 26)
6844 c1 += utf16Fixup[c1>>11];
6845 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006846 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006847 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006848
6849 if (c1 != c2)
6850 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006851
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006852 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 }
6854
6855 return (len1 < len2) ? -1 : (len1 != len2);
6856}
6857
Marc-André Lemburge5034372000-08-08 08:04:29 +00006858#else
6859
6860static int
6861unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6862{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006863 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006864
6865 Py_UNICODE *s1 = str1->str;
6866 Py_UNICODE *s2 = str2->str;
6867
6868 len1 = str1->length;
6869 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006870
Marc-André Lemburge5034372000-08-08 08:04:29 +00006871 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006872 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006873
Fredrik Lundh45714e92001-06-26 16:39:36 +00006874 c1 = *s1++;
6875 c2 = *s2++;
6876
6877 if (c1 != c2)
6878 return (c1 < c2) ? -1 : 1;
6879
Marc-André Lemburge5034372000-08-08 08:04:29 +00006880 len1--; len2--;
6881 }
6882
6883 return (len1 < len2) ? -1 : (len1 != len2);
6884}
6885
6886#endif
6887
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006891 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6892 return unicode_compare((PyUnicodeObject *)left,
6893 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006894 PyErr_Format(PyExc_TypeError,
6895 "Can't compare %.100s and %.100s",
6896 left->ob_type->tp_name,
6897 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 return -1;
6899}
6900
Martin v. Löwis5b222132007-06-10 09:51:05 +00006901int
6902PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6903{
6904 int i;
6905 Py_UNICODE *id;
6906 assert(PyUnicode_Check(uni));
6907 id = PyUnicode_AS_UNICODE(uni);
6908 /* Compare Unicode string and source character set string */
6909 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 if (id[i] != str[i])
6911 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006912 /* This check keeps Python strings that end in '\0' from comparing equal
6913 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006914 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006916 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006918 return 0;
6919}
6920
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006921
Benjamin Peterson29060642009-01-31 22:14:21 +00006922#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006923 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006924
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006925PyObject *PyUnicode_RichCompare(PyObject *left,
6926 PyObject *right,
6927 int op)
6928{
6929 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006930
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006931 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6932 PyObject *v;
6933 if (((PyUnicodeObject *) left)->length !=
6934 ((PyUnicodeObject *) right)->length) {
6935 if (op == Py_EQ) {
6936 Py_INCREF(Py_False);
6937 return Py_False;
6938 }
6939 if (op == Py_NE) {
6940 Py_INCREF(Py_True);
6941 return Py_True;
6942 }
6943 }
6944 if (left == right)
6945 result = 0;
6946 else
6947 result = unicode_compare((PyUnicodeObject *)left,
6948 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006949
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006950 /* Convert the return value to a Boolean */
6951 switch (op) {
6952 case Py_EQ:
6953 v = TEST_COND(result == 0);
6954 break;
6955 case Py_NE:
6956 v = TEST_COND(result != 0);
6957 break;
6958 case Py_LE:
6959 v = TEST_COND(result <= 0);
6960 break;
6961 case Py_GE:
6962 v = TEST_COND(result >= 0);
6963 break;
6964 case Py_LT:
6965 v = TEST_COND(result == -1);
6966 break;
6967 case Py_GT:
6968 v = TEST_COND(result == 1);
6969 break;
6970 default:
6971 PyErr_BadArgument();
6972 return NULL;
6973 }
6974 Py_INCREF(v);
6975 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006977
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006978 Py_INCREF(Py_NotImplemented);
6979 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006980}
6981
Guido van Rossum403d68b2000-03-13 15:55:09 +00006982int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006984{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006985 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006986 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006987
6988 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006989 sub = PyUnicode_FromObject(element);
6990 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 PyErr_Format(PyExc_TypeError,
6992 "'in <string>' requires string as left operand, not %s",
6993 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006994 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006995 }
6996
Thomas Wouters477c8d52006-05-27 19:21:47 +00006997 str = PyUnicode_FromObject(container);
6998 if (!str) {
6999 Py_DECREF(sub);
7000 return -1;
7001 }
7002
7003 result = stringlib_contains_obj(str, sub);
7004
7005 Py_DECREF(str);
7006 Py_DECREF(sub);
7007
Guido van Rossum403d68b2000-03-13 15:55:09 +00007008 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007009}
7010
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011/* Concat to string or Unicode object giving a new Unicode object. */
7012
7013PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015{
7016 PyUnicodeObject *u = NULL, *v = NULL, *w;
7017
7018 /* Coerce the two arguments */
7019 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7020 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7023 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025
7026 /* Shortcuts */
7027 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 Py_DECREF(v);
7029 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 }
7031 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 Py_DECREF(u);
7033 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 }
7035
7036 /* Concat the two Unicode strings */
7037 w = _PyUnicode_New(u->length + v->length);
7038 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 Py_UNICODE_COPY(w->str, u->str, u->length);
7041 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7042
7043 Py_DECREF(u);
7044 Py_DECREF(v);
7045 return (PyObject *)w;
7046
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 Py_XDECREF(u);
7049 Py_XDECREF(v);
7050 return NULL;
7051}
7052
Walter Dörwald1ab83302007-05-18 17:15:44 +00007053void
7054PyUnicode_Append(PyObject **pleft, PyObject *right)
7055{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007056 PyObject *new;
7057 if (*pleft == NULL)
7058 return;
7059 if (right == NULL || !PyUnicode_Check(*pleft)) {
7060 Py_DECREF(*pleft);
7061 *pleft = NULL;
7062 return;
7063 }
7064 new = PyUnicode_Concat(*pleft, right);
7065 Py_DECREF(*pleft);
7066 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007067}
7068
7069void
7070PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7071{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007072 PyUnicode_Append(pleft, right);
7073 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007074}
7075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007076PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007079Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007080string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007081interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082
7083static PyObject *
7084unicode_count(PyUnicodeObject *self, PyObject *args)
7085{
7086 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007087 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007088 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 PyObject *result;
7090
Guido van Rossumb8872e62000-05-09 14:14:27 +00007091 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093 return NULL;
7094
7095 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007096 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007099
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007100 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007101 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007102 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007103 substring->str, substring->length,
7104 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007105 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106
7107 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007108
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 return result;
7110}
7111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007112PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007115Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007116to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007117handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007118a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7119'xmlcharrefreplace' as well as any other name registered with\n\
7120codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121
7122static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007123unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007125 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126 char *encoding = NULL;
7127 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007128 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007129
Benjamin Peterson308d6372009-09-18 21:42:35 +00007130 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7131 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007133 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007134 if (v == NULL)
7135 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007136 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007137 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007138 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007139 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007140 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007141 Py_DECREF(v);
7142 return NULL;
7143 }
7144 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007145
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007147 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007148}
7149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007150PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152\n\
7153Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007154If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155
7156static PyObject*
7157unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7158{
7159 Py_UNICODE *e;
7160 Py_UNICODE *p;
7161 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007162 Py_UNICODE *qe;
7163 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 PyUnicodeObject *u;
7165 int tabsize = 8;
7166
7167 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169
Thomas Wouters7e474022000-07-16 12:04:32 +00007170 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007171 i = 0; /* chars up to and including most recent \n or \r */
7172 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7173 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 for (p = self->str; p < e; p++)
7175 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 if (tabsize > 0) {
7177 incr = tabsize - (j % tabsize); /* cannot overflow */
7178 if (j > PY_SSIZE_T_MAX - incr)
7179 goto overflow1;
7180 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007181 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 if (j > PY_SSIZE_T_MAX - 1)
7185 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 j++;
7187 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 if (i > PY_SSIZE_T_MAX - j)
7189 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007191 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 }
7193 }
7194
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007195 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007197
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 /* Second pass: create output string and fill it */
7199 u = _PyUnicode_New(i + j);
7200 if (!u)
7201 return NULL;
7202
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007203 j = 0; /* same as in first pass */
7204 q = u->str; /* next output char */
7205 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206
7207 for (p = self->str; p < e; p++)
7208 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 if (tabsize > 0) {
7210 i = tabsize - (j % tabsize);
7211 j += i;
7212 while (i--) {
7213 if (q >= qe)
7214 goto overflow2;
7215 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007216 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007218 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 else {
7220 if (q >= qe)
7221 goto overflow2;
7222 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007223 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 if (*p == '\n' || *p == '\r')
7225 j = 0;
7226 }
7227
7228 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007229
7230 overflow2:
7231 Py_DECREF(u);
7232 overflow1:
7233 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7234 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235}
7236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007237PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239\n\
7240Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007241such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242arguments start and end are interpreted as in slice notation.\n\
7243\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007244Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245
7246static PyObject *
7247unicode_find(PyUnicodeObject *self, PyObject *args)
7248{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007249 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007250 Py_ssize_t start;
7251 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007252 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253
Christian Heimes9cd17752007-11-18 19:35:23 +00007254 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256
Thomas Wouters477c8d52006-05-27 19:21:47 +00007257 result = stringlib_find_slice(
7258 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7259 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7260 start, end
7261 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262
7263 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007264
Christian Heimes217cfd12007-12-02 14:31:20 +00007265 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266}
7267
7268static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007269unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270{
7271 if (index < 0 || index >= self->length) {
7272 PyErr_SetString(PyExc_IndexError, "string index out of range");
7273 return NULL;
7274 }
7275
7276 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7277}
7278
Guido van Rossumc2504932007-09-18 19:42:40 +00007279/* Believe it or not, this produces the same value for ASCII strings
7280 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007282unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283{
Guido van Rossumc2504932007-09-18 19:42:40 +00007284 Py_ssize_t len;
7285 Py_UNICODE *p;
7286 long x;
7287
7288 if (self->hash != -1)
7289 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007290 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007291 p = self->str;
7292 x = *p << 7;
7293 while (--len >= 0)
7294 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007295 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007296 if (x == -1)
7297 x = -2;
7298 self->hash = x;
7299 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300}
7301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007302PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007303 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007305Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
7307static PyObject *
7308unicode_index(PyUnicodeObject *self, PyObject *args)
7309{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007310 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007311 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007312 Py_ssize_t start;
7313 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314
Christian Heimes9cd17752007-11-18 19:35:23 +00007315 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317
Thomas Wouters477c8d52006-05-27 19:21:47 +00007318 result = stringlib_find_slice(
7319 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7320 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7321 start, end
7322 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323
7324 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007325
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326 if (result < 0) {
7327 PyErr_SetString(PyExc_ValueError, "substring not found");
7328 return NULL;
7329 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007330
Christian Heimes217cfd12007-12-02 14:31:20 +00007331 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332}
7333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007334PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007337Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007338at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339
7340static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007341unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342{
7343 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7344 register const Py_UNICODE *e;
7345 int cased;
7346
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 /* Shortcut for single character strings */
7348 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007351 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007352 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007354
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 e = p + PyUnicode_GET_SIZE(self);
7356 cased = 0;
7357 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007359
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7361 return PyBool_FromLong(0);
7362 else if (!cased && Py_UNICODE_ISLOWER(ch))
7363 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007365 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366}
7367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007368PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007371Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007372at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
7374static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007375unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376{
7377 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7378 register const Py_UNICODE *e;
7379 int cased;
7380
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 /* Shortcut for single character strings */
7382 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007385 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007386 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007388
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389 e = p + PyUnicode_GET_SIZE(self);
7390 cased = 0;
7391 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007393
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7395 return PyBool_FromLong(0);
7396 else if (!cased && Py_UNICODE_ISUPPER(ch))
7397 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007399 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400}
7401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007402PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007405Return True if S is a titlecased string and there is at least one\n\
7406character in S, i.e. upper- and titlecase characters may only\n\
7407follow uncased characters and lowercase characters only cased ones.\n\
7408Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
7410static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007411unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412{
7413 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7414 register const Py_UNICODE *e;
7415 int cased, previous_is_cased;
7416
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 /* Shortcut for single character strings */
7418 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7420 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007422 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007423 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007425
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 e = p + PyUnicode_GET_SIZE(self);
7427 cased = 0;
7428 previous_is_cased = 0;
7429 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007431
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7433 if (previous_is_cased)
7434 return PyBool_FromLong(0);
7435 previous_is_cased = 1;
7436 cased = 1;
7437 }
7438 else if (Py_UNICODE_ISLOWER(ch)) {
7439 if (!previous_is_cased)
7440 return PyBool_FromLong(0);
7441 previous_is_cased = 1;
7442 cased = 1;
7443 }
7444 else
7445 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007447 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448}
7449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007450PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007453Return True if all characters in S are whitespace\n\
7454and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455
7456static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007457unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458{
7459 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7460 register const Py_UNICODE *e;
7461
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 /* Shortcut for single character strings */
7463 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 Py_UNICODE_ISSPACE(*p))
7465 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007467 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007468 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007470
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 e = p + PyUnicode_GET_SIZE(self);
7472 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 if (!Py_UNICODE_ISSPACE(*p))
7474 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007476 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477}
7478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007479PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007481\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007482Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007483and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007484
7485static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007486unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007487{
7488 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7489 register const Py_UNICODE *e;
7490
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007491 /* Shortcut for single character strings */
7492 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 Py_UNICODE_ISALPHA(*p))
7494 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007495
7496 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007497 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007499
7500 e = p + PyUnicode_GET_SIZE(self);
7501 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 if (!Py_UNICODE_ISALPHA(*p))
7503 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007504 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007505 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007506}
7507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007508PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007510\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007511Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007512and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007513
7514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007515unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007516{
7517 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7518 register const Py_UNICODE *e;
7519
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007520 /* Shortcut for single character strings */
7521 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 Py_UNICODE_ISALNUM(*p))
7523 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007524
7525 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007526 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007528
7529 e = p + PyUnicode_GET_SIZE(self);
7530 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 if (!Py_UNICODE_ISALNUM(*p))
7532 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007533 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007534 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007535}
7536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007537PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007540Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007541False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542
7543static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007544unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545{
7546 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7547 register const Py_UNICODE *e;
7548
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 /* Shortcut for single character strings */
7550 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 Py_UNICODE_ISDECIMAL(*p))
7552 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007554 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007555 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007557
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 e = p + PyUnicode_GET_SIZE(self);
7559 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 if (!Py_UNICODE_ISDECIMAL(*p))
7561 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007563 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564}
7565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007567 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007569Return True if all characters in S are digits\n\
7570and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571
7572static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007573unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574{
7575 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7576 register const Py_UNICODE *e;
7577
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 /* Shortcut for single character strings */
7579 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 Py_UNICODE_ISDIGIT(*p))
7581 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007583 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007584 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007586
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587 e = p + PyUnicode_GET_SIZE(self);
7588 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 if (!Py_UNICODE_ISDIGIT(*p))
7590 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007592 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593}
7594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007595PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007598Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007599False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600
7601static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007602unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603{
7604 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7605 register const Py_UNICODE *e;
7606
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 /* Shortcut for single character strings */
7608 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 Py_UNICODE_ISNUMERIC(*p))
7610 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007612 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007613 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007615
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616 e = p + PyUnicode_GET_SIZE(self);
7617 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 if (!Py_UNICODE_ISNUMERIC(*p))
7619 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007621 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622}
7623
Martin v. Löwis47383402007-08-15 07:32:56 +00007624int
7625PyUnicode_IsIdentifier(PyObject *self)
7626{
7627 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7628 register const Py_UNICODE *e;
7629
7630 /* Special case for empty strings */
7631 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007633
7634 /* PEP 3131 says that the first character must be in
7635 XID_Start and subsequent characters in XID_Continue,
7636 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007637 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007638 letters, digits, underscore). However, given the current
7639 definition of XID_Start and XID_Continue, it is sufficient
7640 to check just for these, except that _ must be allowed
7641 as starting an identifier. */
7642 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7643 return 0;
7644
7645 e = p + PyUnicode_GET_SIZE(self);
7646 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 if (!_PyUnicode_IsXidContinue(*p))
7648 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007649 }
7650 return 1;
7651}
7652
7653PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007655\n\
7656Return True if S is a valid identifier according\n\
7657to the language definition.");
7658
7659static PyObject*
7660unicode_isidentifier(PyObject *self)
7661{
7662 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7663}
7664
Georg Brandl559e5d72008-06-11 18:37:52 +00007665PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007667\n\
7668Return True if all characters in S are considered\n\
7669printable in repr() or S is empty, False otherwise.");
7670
7671static PyObject*
7672unicode_isprintable(PyObject *self)
7673{
7674 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7675 register const Py_UNICODE *e;
7676
7677 /* Shortcut for single character strings */
7678 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7679 Py_RETURN_TRUE;
7680 }
7681
7682 e = p + PyUnicode_GET_SIZE(self);
7683 for (; p < e; p++) {
7684 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7685 Py_RETURN_FALSE;
7686 }
7687 }
7688 Py_RETURN_TRUE;
7689}
7690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007691PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007692 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693\n\
7694Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007695iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696
7697static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007698unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007700 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701}
7702
Martin v. Löwis18e16552006-02-15 17:27:45 +00007703static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704unicode_length(PyUnicodeObject *self)
7705{
7706 return self->length;
7707}
7708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007709PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007712Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007713done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714
7715static PyObject *
7716unicode_ljust(PyUnicodeObject *self, PyObject *args)
7717{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007718 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007719 Py_UNICODE fillchar = ' ';
7720
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007721 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 return NULL;
7723
Tim Peters7a29bd52001-09-12 03:03:31 +00007724 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 Py_INCREF(self);
7726 return (PyObject*) self;
7727 }
7728
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007729 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730}
7731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007732PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007735Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
7737static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007738unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 return fixup(self, fixlower);
7741}
7742
/* Strip modes understood by the strip helpers below: which end(s) of the
   string characters are removed from. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: its format string with the leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7751
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007752/* externally visible for str.strip(unicode) */
7753PyObject *
7754_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7755{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007756 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7757 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7758 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7759 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7760 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007761
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007763
Benjamin Peterson14339b62009-01-31 16:36:08 +00007764 i = 0;
7765 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7767 i++;
7768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007769 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007770
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771 j = len;
7772 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 do {
7774 j--;
7775 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7776 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007778
Benjamin Peterson14339b62009-01-31 16:36:08 +00007779 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 Py_INCREF(self);
7781 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782 }
7783 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007785}
7786
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787
7788static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007789do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007791 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7792 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007793
Benjamin Peterson14339b62009-01-31 16:36:08 +00007794 i = 0;
7795 if (striptype != RIGHTSTRIP) {
7796 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7797 i++;
7798 }
7799 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007800
Benjamin Peterson14339b62009-01-31 16:36:08 +00007801 j = len;
7802 if (striptype != LEFTSTRIP) {
7803 do {
7804 j--;
7805 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7806 j++;
7807 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007808
Benjamin Peterson14339b62009-01-31 16:36:08 +00007809 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7810 Py_INCREF(self);
7811 return (PyObject*)self;
7812 }
7813 else
7814 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815}
7816
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007817
7818static PyObject *
7819do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7820{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007821 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007822
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7824 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007825
Benjamin Peterson14339b62009-01-31 16:36:08 +00007826 if (sep != NULL && sep != Py_None) {
7827 if (PyUnicode_Check(sep))
7828 return _PyUnicode_XStrip(self, striptype, sep);
7829 else {
7830 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 "%s arg must be None or str",
7832 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007833 return NULL;
7834 }
7835 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007836
Benjamin Peterson14339b62009-01-31 16:36:08 +00007837 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007838}
7839
7840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007841PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007843\n\
7844Return a copy of the string S with leading and trailing\n\
7845whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007846If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007847
7848static PyObject *
7849unicode_strip(PyUnicodeObject *self, PyObject *args)
7850{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007851 if (PyTuple_GET_SIZE(args) == 0)
7852 return do_strip(self, BOTHSTRIP); /* Common case */
7853 else
7854 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007855}
7856
7857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007858PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007860\n\
7861Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007862If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007863
7864static PyObject *
7865unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7866{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007867 if (PyTuple_GET_SIZE(args) == 0)
7868 return do_strip(self, LEFTSTRIP); /* Common case */
7869 else
7870 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007871}
7872
7873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007874PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007876\n\
7877Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007878If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007879
7880static PyObject *
7881unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7882{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007883 if (PyTuple_GET_SIZE(args) == 0)
7884 return do_strip(self, RIGHTSTRIP); /* Common case */
7885 else
7886 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007887}
7888
7889
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007891unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892{
7893 PyUnicodeObject *u;
7894 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007895 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007896 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897
Georg Brandl222de0f2009-04-12 12:01:50 +00007898 if (len < 1) {
7899 Py_INCREF(unicode_empty);
7900 return (PyObject *)unicode_empty;
7901 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902
Tim Peters7a29bd52001-09-12 03:03:31 +00007903 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904 /* no repeat, return original string */
7905 Py_INCREF(str);
7906 return (PyObject*) str;
7907 }
Tim Peters8f422462000-09-09 06:13:41 +00007908
7909 /* ensure # of chars needed doesn't overflow int and # of bytes
7910 * needed doesn't overflow size_t
7911 */
7912 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007913 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007914 PyErr_SetString(PyExc_OverflowError,
7915 "repeated string is too long");
7916 return NULL;
7917 }
7918 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7919 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7920 PyErr_SetString(PyExc_OverflowError,
7921 "repeated string is too long");
7922 return NULL;
7923 }
7924 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925 if (!u)
7926 return NULL;
7927
7928 p = u->str;
7929
Georg Brandl222de0f2009-04-12 12:01:50 +00007930 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007931 Py_UNICODE_FILL(p, str->str[0], len);
7932 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007933 Py_ssize_t done = str->length; /* number of characters copied this far */
7934 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007936 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007937 Py_UNICODE_COPY(p+done, p, n);
7938 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940 }
7941
7942 return (PyObject*) u;
7943}
7944
7945PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 PyObject *subobj,
7947 PyObject *replobj,
7948 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949{
7950 PyObject *self;
7951 PyObject *str1;
7952 PyObject *str2;
7953 PyObject *result;
7954
7955 self = PyUnicode_FromObject(obj);
7956 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 str1 = PyUnicode_FromObject(subobj);
7959 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 Py_DECREF(self);
7961 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 }
7963 str2 = PyUnicode_FromObject(replobj);
7964 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 Py_DECREF(self);
7966 Py_DECREF(str1);
7967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 }
Tim Petersced69f82003-09-16 20:30:58 +00007969 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 (PyUnicodeObject *)str1,
7971 (PyUnicodeObject *)str2,
7972 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 Py_DECREF(self);
7974 Py_DECREF(str1);
7975 Py_DECREF(str2);
7976 return result;
7977}
7978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007979PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00007980 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981\n\
7982Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007983old replaced by new. If the optional argument count is\n\
7984given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985
7986static PyObject*
7987unicode_replace(PyUnicodeObject *self, PyObject *args)
7988{
7989 PyUnicodeObject *str1;
7990 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007991 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 PyObject *result;
7993
Martin v. Löwis18e16552006-02-15 17:27:45 +00007994 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 return NULL;
7996 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7997 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008000 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 Py_DECREF(str1);
8002 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004
8005 result = replace(self, str1, str2, maxcount);
8006
8007 Py_DECREF(str1);
8008 Py_DECREF(str2);
8009 return result;
8010}
8011
8012static
8013PyObject *unicode_repr(PyObject *unicode)
8014{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008015 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008016 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008017 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8018 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8019
8020 /* XXX(nnorwitz): rather than over-allocating, it would be
8021 better to choose a different scheme. Perhaps scan the
8022 first N-chars of the string and allocate based on that size.
8023 */
8024 /* Initial allocation is based on the longest-possible unichr
8025 escape.
8026
8027 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8028 unichr, so in this case it's the longest unichr escape. In
8029 narrow (UTF-16) builds this is five chars per source unichr
8030 since there are two unichrs in the surrogate pair, so in narrow
8031 (UTF-16) builds it's not the longest unichr escape.
8032
8033 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8034 so in the narrow (UTF-16) build case it's the longest unichr
8035 escape.
8036 */
8037
Walter Dörwald1ab83302007-05-18 17:15:44 +00008038 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008040#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008042#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008044#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008046 if (repr == NULL)
8047 return NULL;
8048
Walter Dörwald1ab83302007-05-18 17:15:44 +00008049 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008050
8051 /* Add quote */
8052 *p++ = (findchar(s, size, '\'') &&
8053 !findchar(s, size, '"')) ? '"' : '\'';
8054 while (size-- > 0) {
8055 Py_UNICODE ch = *s++;
8056
8057 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008058 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008059 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008060 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008061 continue;
8062 }
8063
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008065 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008066 *p++ = '\\';
8067 *p++ = 't';
8068 }
8069 else if (ch == '\n') {
8070 *p++ = '\\';
8071 *p++ = 'n';
8072 }
8073 else if (ch == '\r') {
8074 *p++ = '\\';
8075 *p++ = 'r';
8076 }
8077
8078 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008079 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008080 *p++ = '\\';
8081 *p++ = 'x';
8082 *p++ = hexdigits[(ch >> 4) & 0x000F];
8083 *p++ = hexdigits[ch & 0x000F];
8084 }
8085
Georg Brandl559e5d72008-06-11 18:37:52 +00008086 /* Copy ASCII characters as-is */
8087 else if (ch < 0x7F) {
8088 *p++ = ch;
8089 }
8090
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008092 else {
8093 Py_UCS4 ucs = ch;
8094
8095#ifndef Py_UNICODE_WIDE
8096 Py_UNICODE ch2 = 0;
8097 /* Get code point from surrogate pair */
8098 if (size > 0) {
8099 ch2 = *s;
8100 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008102 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008104 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008105 size--;
8106 }
8107 }
8108#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008109 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008110 (categories Z* and C* except ASCII space)
8111 */
8112 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8113 /* Map 8-bit characters to '\xhh' */
8114 if (ucs <= 0xff) {
8115 *p++ = '\\';
8116 *p++ = 'x';
8117 *p++ = hexdigits[(ch >> 4) & 0x000F];
8118 *p++ = hexdigits[ch & 0x000F];
8119 }
8120 /* Map 21-bit characters to '\U00xxxxxx' */
8121 else if (ucs >= 0x10000) {
8122 *p++ = '\\';
8123 *p++ = 'U';
8124 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8125 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8126 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8127 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8128 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8129 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8130 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8131 *p++ = hexdigits[ucs & 0x0000000F];
8132 }
8133 /* Map 16-bit characters to '\uxxxx' */
8134 else {
8135 *p++ = '\\';
8136 *p++ = 'u';
8137 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8138 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8139 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8140 *p++ = hexdigits[ucs & 0x000F];
8141 }
8142 }
8143 /* Copy characters as-is */
8144 else {
8145 *p++ = ch;
8146#ifndef Py_UNICODE_WIDE
8147 if (ucs >= 0x10000)
8148 *p++ = ch2;
8149#endif
8150 }
8151 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008152 }
8153 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008154 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008155
8156 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008157 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008158 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159}
8160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008161PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163\n\
8164Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008165such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166arguments start and end are interpreted as in slice notation.\n\
8167\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008168Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169
8170static PyObject *
8171unicode_rfind(PyUnicodeObject *self, PyObject *args)
8172{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008173 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008174 Py_ssize_t start;
8175 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008176 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177
Christian Heimes9cd17752007-11-18 19:35:23 +00008178 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180
Thomas Wouters477c8d52006-05-27 19:21:47 +00008181 result = stringlib_rfind_slice(
8182 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8183 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8184 start, end
8185 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186
8187 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008188
Christian Heimes217cfd12007-12-02 14:31:20 +00008189 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190}
8191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008192PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008195Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196
8197static PyObject *
8198unicode_rindex(PyUnicodeObject *self, PyObject *args)
8199{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008200 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008201 Py_ssize_t start;
8202 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008203 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204
Christian Heimes9cd17752007-11-18 19:35:23 +00008205 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207
Thomas Wouters477c8d52006-05-27 19:21:47 +00008208 result = stringlib_rfind_slice(
8209 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8210 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8211 start, end
8212 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213
8214 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008215
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 if (result < 0) {
8217 PyErr_SetString(PyExc_ValueError, "substring not found");
8218 return NULL;
8219 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008220 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221}
8222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008223PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008226Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008227done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228
8229static PyObject *
8230unicode_rjust(PyUnicodeObject *self, PyObject *args)
8231{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008232 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008233 Py_UNICODE fillchar = ' ';
8234
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008235 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 return NULL;
8237
Tim Peters7a29bd52001-09-12 03:03:31 +00008238 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 Py_INCREF(self);
8240 return (PyObject*) self;
8241 }
8242
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008243 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244}
8245
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 PyObject *sep,
8248 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249{
8250 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008251
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 s = PyUnicode_FromObject(s);
8253 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008254 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 if (sep != NULL) {
8256 sep = PyUnicode_FromObject(sep);
8257 if (sep == NULL) {
8258 Py_DECREF(s);
8259 return NULL;
8260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 }
8262
8263 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8264
8265 Py_DECREF(s);
8266 Py_XDECREF(sep);
8267 return result;
8268}
8269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008270PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272\n\
8273Return a list of the words in S, using sep as the\n\
8274delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008275splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008276whitespace string is a separator and empty strings are\n\
8277removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278
8279static PyObject*
8280unicode_split(PyUnicodeObject *self, PyObject *args)
8281{
8282 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008283 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284
Martin v. Löwis18e16552006-02-15 17:27:45 +00008285 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 return NULL;
8287
8288 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294}
8295
Thomas Wouters477c8d52006-05-27 19:21:47 +00008296PyObject *
8297PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8298{
8299 PyObject* str_obj;
8300 PyObject* sep_obj;
8301 PyObject* out;
8302
8303 str_obj = PyUnicode_FromObject(str_in);
8304 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008306 sep_obj = PyUnicode_FromObject(sep_in);
8307 if (!sep_obj) {
8308 Py_DECREF(str_obj);
8309 return NULL;
8310 }
8311
8312 out = stringlib_partition(
8313 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8314 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8315 );
8316
8317 Py_DECREF(sep_obj);
8318 Py_DECREF(str_obj);
8319
8320 return out;
8321}
8322
8323
8324PyObject *
8325PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8326{
8327 PyObject* str_obj;
8328 PyObject* sep_obj;
8329 PyObject* out;
8330
8331 str_obj = PyUnicode_FromObject(str_in);
8332 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008334 sep_obj = PyUnicode_FromObject(sep_in);
8335 if (!sep_obj) {
8336 Py_DECREF(str_obj);
8337 return NULL;
8338 }
8339
8340 out = stringlib_rpartition(
8341 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8342 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8343 );
8344
8345 Py_DECREF(sep_obj);
8346 Py_DECREF(str_obj);
8347
8348 return out;
8349}
8350
8351PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008353\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008354Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008355the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008356found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008357
8358static PyObject*
8359unicode_partition(PyUnicodeObject *self, PyObject *separator)
8360{
8361 return PyUnicode_Partition((PyObject *)self, separator);
8362}
8363
8364PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008365 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008366\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008367Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008368the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008369separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008370
8371static PyObject*
8372unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8373{
8374 return PyUnicode_RPartition((PyObject *)self, separator);
8375}
8376
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008377PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 PyObject *sep,
8379 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008380{
8381 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008382
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008383 s = PyUnicode_FromObject(s);
8384 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008385 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 if (sep != NULL) {
8387 sep = PyUnicode_FromObject(sep);
8388 if (sep == NULL) {
8389 Py_DECREF(s);
8390 return NULL;
8391 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008392 }
8393
8394 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8395
8396 Py_DECREF(s);
8397 Py_XDECREF(sep);
8398 return result;
8399}
8400
8401PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008403\n\
8404Return a list of the words in S, using sep as the\n\
8405delimiter string, starting at the end of the string and\n\
8406working to the front. If maxsplit is given, at most maxsplit\n\
8407splits are done. If sep is not specified, any whitespace string\n\
8408is a separator.");
8409
8410static PyObject*
8411unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8412{
8413 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008414 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008415
Martin v. Löwis18e16552006-02-15 17:27:45 +00008416 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008417 return NULL;
8418
8419 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008421 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008423 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008425}
8426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008427PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429\n\
8430Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008431Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008432is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433
8434static PyObject*
8435unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8436{
Guido van Rossum86662912000-04-11 15:38:46 +00008437 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438
Guido van Rossum86662912000-04-11 15:38:46 +00008439 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 return NULL;
8441
Guido van Rossum86662912000-04-11 15:38:46 +00008442 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443}
8444
8445static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008446PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447{
Walter Dörwald346737f2007-05-31 10:44:43 +00008448 if (PyUnicode_CheckExact(self)) {
8449 Py_INCREF(self);
8450 return self;
8451 } else
8452 /* Subtype -- return genuine unicode string with the same value. */
8453 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8454 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455}
8456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008457PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459\n\
8460Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008461and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462
8463static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008464unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466 return fixup(self, fixswapcase);
8467}
8468
Georg Brandlceee0772007-11-27 23:48:05 +00008469PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008471\n\
8472Return a translation table usable for str.translate().\n\
8473If there is only one argument, it must be a dictionary mapping Unicode\n\
8474ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008475Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008476If there are two arguments, they must be strings of equal length, and\n\
8477in the resulting dictionary, each character in x will be mapped to the\n\
8478character at the same position in y. If there is a third argument, it\n\
8479must be a string, whose characters will be mapped to None in the result.");
8480
8481static PyObject*
8482unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8483{
8484 PyObject *x, *y = NULL, *z = NULL;
8485 PyObject *new = NULL, *key, *value;
8486 Py_ssize_t i = 0;
8487 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008488
Georg Brandlceee0772007-11-27 23:48:05 +00008489 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8490 return NULL;
8491 new = PyDict_New();
8492 if (!new)
8493 return NULL;
8494 if (y != NULL) {
8495 /* x must be a string too, of equal length */
8496 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8497 if (!PyUnicode_Check(x)) {
8498 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8499 "be a string if there is a second argument");
8500 goto err;
8501 }
8502 if (PyUnicode_GET_SIZE(x) != ylen) {
8503 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8504 "arguments must have equal length");
8505 goto err;
8506 }
8507 /* create entries for translating chars in x to those in y */
8508 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008509 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8510 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008511 if (!key || !value)
8512 goto err;
8513 res = PyDict_SetItem(new, key, value);
8514 Py_DECREF(key);
8515 Py_DECREF(value);
8516 if (res < 0)
8517 goto err;
8518 }
8519 /* create entries for deleting chars in z */
8520 if (z != NULL) {
8521 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008522 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008523 if (!key)
8524 goto err;
8525 res = PyDict_SetItem(new, key, Py_None);
8526 Py_DECREF(key);
8527 if (res < 0)
8528 goto err;
8529 }
8530 }
8531 } else {
8532 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008533 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008534 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8535 "to maketrans it must be a dict");
8536 goto err;
8537 }
8538 /* copy entries into the new dict, converting string keys to int keys */
8539 while (PyDict_Next(x, &i, &key, &value)) {
8540 if (PyUnicode_Check(key)) {
8541 /* convert string keys to integer keys */
8542 PyObject *newkey;
8543 if (PyUnicode_GET_SIZE(key) != 1) {
8544 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8545 "table must be of length 1");
8546 goto err;
8547 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008548 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008549 if (!newkey)
8550 goto err;
8551 res = PyDict_SetItem(new, newkey, value);
8552 Py_DECREF(newkey);
8553 if (res < 0)
8554 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008555 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008556 /* just keep integer keys */
8557 if (PyDict_SetItem(new, key, value) < 0)
8558 goto err;
8559 } else {
8560 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8561 "be strings or integers");
8562 goto err;
8563 }
8564 }
8565 }
8566 return new;
8567 err:
8568 Py_DECREF(new);
8569 return NULL;
8570}
8571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008572PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574\n\
8575Return a copy of the string S, where all characters have been mapped\n\
8576through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008577Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008578Unmapped characters are left untouched. Characters mapped to None\n\
8579are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580
8581static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008582unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583{
Georg Brandlceee0772007-11-27 23:48:05 +00008584 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585}
8586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008587PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008590Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
8592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008593unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595 return fixup(self, fixupper);
8596}
8597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008598PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008601Pad a numeric string S with zeros on the left, to fill a field\n\
8602of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603
8604static PyObject *
8605unicode_zfill(PyUnicodeObject *self, PyObject *args)
8606{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008607 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 PyUnicodeObject *u;
8609
Martin v. Löwis18e16552006-02-15 17:27:45 +00008610 Py_ssize_t width;
8611 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 return NULL;
8613
8614 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008615 if (PyUnicode_CheckExact(self)) {
8616 Py_INCREF(self);
8617 return (PyObject*) self;
8618 }
8619 else
8620 return PyUnicode_FromUnicode(
8621 PyUnicode_AS_UNICODE(self),
8622 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 }
8625
8626 fill = width - self->length;
8627
8628 u = pad(self, fill, 0, '0');
8629
Walter Dörwald068325e2002-04-15 13:36:47 +00008630 if (u == NULL)
8631 return NULL;
8632
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 if (u->str[fill] == '+' || u->str[fill] == '-') {
8634 /* move sign to beginning of string */
8635 u->str[0] = u->str[fill];
8636 u->str[fill] = '0';
8637 }
8638
8639 return (PyObject*) u;
8640}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641
#if 0
/* Debugging aid (compiled out): report the current value of the
   file-level `numfree` counter as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008650PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008653Return True if S starts with the specified prefix, False otherwise.\n\
8654With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008655With optional end, stop comparing S at that position.\n\
8656prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657
8658static PyObject *
8659unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008662 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008664 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008665 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008666 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008668 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8670 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008671 if (PyTuple_Check(subobj)) {
8672 Py_ssize_t i;
8673 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8674 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008676 if (substring == NULL)
8677 return NULL;
8678 result = tailmatch(self, substring, start, end, -1);
8679 Py_DECREF(substring);
8680 if (result) {
8681 Py_RETURN_TRUE;
8682 }
8683 }
8684 /* nothing matched */
8685 Py_RETURN_FALSE;
8686 }
8687 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008690 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008692 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693}
8694
8695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008696PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008699Return True if S ends with the specified suffix, False otherwise.\n\
8700With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008701With optional end, stop comparing S at that position.\n\
8702suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703
8704static PyObject *
8705unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008708 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008710 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008711 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008712 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008714 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8716 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008717 if (PyTuple_Check(subobj)) {
8718 Py_ssize_t i;
8719 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8720 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008722 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008724 result = tailmatch(self, substring, start, end, +1);
8725 Py_DECREF(substring);
8726 if (result) {
8727 Py_RETURN_TRUE;
8728 }
8729 }
8730 Py_RETURN_FALSE;
8731 }
8732 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008736 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008738 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739}
8740
Eric Smith8c663262007-08-25 02:26:07 +00008741#include "stringlib/string_format.h"
8742
8743PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008745\n\
8746");
8747
Eric Smith4a7d76d2008-05-30 18:10:19 +00008748static PyObject *
8749unicode__format__(PyObject* self, PyObject* args)
8750{
8751 PyObject *format_spec;
8752
8753 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8754 return NULL;
8755
8756 return _PyUnicode_FormatAdvanced(self,
8757 PyUnicode_AS_UNICODE(format_spec),
8758 PyUnicode_GET_SIZE(format_spec));
8759}
8760
Eric Smith8c663262007-08-25 02:26:07 +00008761PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008763\n\
8764");
8765
8766static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008767unicode__sizeof__(PyUnicodeObject *v)
8768{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008769 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8770 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008771}
8772
8773PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008775
8776static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008777unicode_getnewargs(PyUnicodeObject *v)
8778{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008779 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008780}
8781
8782
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783static PyMethodDef unicode_methods[] = {
8784
8785 /* Order is according to common usage: often used methods should
8786 appear first, since lookup is done sequentially. */
8787
Benjamin Peterson308d6372009-09-18 21:42:35 +00008788 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008789 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8790 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008791 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008792 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8793 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8794 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8795 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8796 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8797 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8798 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008799 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008800 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8801 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8802 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008803 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008804 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8805 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8806 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008807 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008808 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008809 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008810 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008811 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8812 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8813 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8814 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8815 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8816 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8817 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8818 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8819 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8820 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8821 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8822 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8823 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8824 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008825 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008826 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008827 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008828 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008829 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008830 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8831 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008832 {"maketrans", (PyCFunction) unicode_maketrans,
8833 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008834 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008835#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008836 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837#endif
8838
8839#if 0
8840 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008841 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842#endif
8843
Benjamin Peterson14339b62009-01-31 16:36:08 +00008844 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845 {NULL, NULL}
8846};
8847
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008848static PyObject *
8849unicode_mod(PyObject *v, PyObject *w)
8850{
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 if (!PyUnicode_Check(v)) {
8852 Py_INCREF(Py_NotImplemented);
8853 return Py_NotImplemented;
8854 }
8855 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008856}
8857
8858static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008859 0, /*nb_add*/
8860 0, /*nb_subtract*/
8861 0, /*nb_multiply*/
8862 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008863};
8864
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008866 (lenfunc) unicode_length, /* sq_length */
8867 PyUnicode_Concat, /* sq_concat */
8868 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8869 (ssizeargfunc) unicode_getitem, /* sq_item */
8870 0, /* sq_slice */
8871 0, /* sq_ass_item */
8872 0, /* sq_ass_slice */
8873 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874};
8875
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008876static PyObject*
8877unicode_subscript(PyUnicodeObject* self, PyObject* item)
8878{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008879 if (PyIndex_Check(item)) {
8880 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008881 if (i == -1 && PyErr_Occurred())
8882 return NULL;
8883 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008884 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008885 return unicode_getitem(self, i);
8886 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008887 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008888 Py_UNICODE* source_buf;
8889 Py_UNICODE* result_buf;
8890 PyObject* result;
8891
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008892 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008894 return NULL;
8895 }
8896
8897 if (slicelength <= 0) {
8898 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008899 } else if (start == 0 && step == 1 && slicelength == self->length &&
8900 PyUnicode_CheckExact(self)) {
8901 Py_INCREF(self);
8902 return (PyObject *)self;
8903 } else if (step == 1) {
8904 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008905 } else {
8906 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008907 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8908 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008909
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 if (result_buf == NULL)
8911 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008912
8913 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8914 result_buf[i] = source_buf[cur];
8915 }
Tim Petersced69f82003-09-16 20:30:58 +00008916
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008917 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008918 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008919 return result;
8920 }
8921 } else {
8922 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8923 return NULL;
8924 }
8925}
8926
8927static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008928 (lenfunc)unicode_length, /* mp_length */
8929 (binaryfunc)unicode_subscript, /* mp_subscript */
8930 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008931};
8932
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934/* Helpers for PyUnicode_Format() */
8935
8936static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008937getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008939 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 (*p_argidx)++;
8942 if (arglen < 0)
8943 return args;
8944 else
8945 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 }
8947 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 return NULL;
8950}
8951
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008952/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008954static PyObject *
8955formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008957 char *p;
8958 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008960
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 x = PyFloat_AsDouble(v);
8962 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008963 return NULL;
8964
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008966 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008967
Eric Smith0923d1d2009-04-16 20:16:10 +00008968 p = PyOS_double_to_string(x, type, prec,
8969 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008970 if (p == NULL)
8971 return NULL;
8972 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008973 PyMem_Free(p);
8974 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975}
8976
Tim Peters38fd5b62000-09-21 05:43:11 +00008977static PyObject*
8978formatlong(PyObject *val, int flags, int prec, int type)
8979{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008980 char *buf;
8981 int len;
8982 PyObject *str; /* temporary string object. */
8983 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008984
Benjamin Peterson14339b62009-01-31 16:36:08 +00008985 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8986 if (!str)
8987 return NULL;
8988 result = PyUnicode_FromStringAndSize(buf, len);
8989 Py_DECREF(str);
8990 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008991}
8992
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993static int
8994formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008995 size_t buflen,
8996 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008998 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008999 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 if (PyUnicode_GET_SIZE(v) == 1) {
9001 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9002 buf[1] = '\0';
9003 return 1;
9004 }
9005#ifndef Py_UNICODE_WIDE
9006 if (PyUnicode_GET_SIZE(v) == 2) {
9007 /* Decode a valid surrogate pair */
9008 int c0 = PyUnicode_AS_UNICODE(v)[0];
9009 int c1 = PyUnicode_AS_UNICODE(v)[1];
9010 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9011 0xDC00 <= c1 && c1 <= 0xDFFF) {
9012 buf[0] = c0;
9013 buf[1] = c1;
9014 buf[2] = '\0';
9015 return 2;
9016 }
9017 }
9018#endif
9019 goto onError;
9020 }
9021 else {
9022 /* Integer input truncated to a character */
9023 long x;
9024 x = PyLong_AsLong(v);
9025 if (x == -1 && PyErr_Occurred())
9026 goto onError;
9027
9028 if (x < 0 || x > 0x10ffff) {
9029 PyErr_SetString(PyExc_OverflowError,
9030 "%c arg not in range(0x110000)");
9031 return -1;
9032 }
9033
9034#ifndef Py_UNICODE_WIDE
9035 if (x > 0xffff) {
9036 x -= 0x10000;
9037 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9038 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9039 return 2;
9040 }
9041#endif
9042 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009043 buf[1] = '\0';
9044 return 1;
9045 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009046
Benjamin Peterson29060642009-01-31 22:14:21 +00009047 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009048 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009049 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009050 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051}
9052
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is parenthesized so the cast cannot bind to neighbouring
   tokens at the point of use.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009057
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009059 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060{
9061 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009062 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063 int args_owned = 0;
9064 PyUnicodeObject *result = NULL;
9065 PyObject *dict = NULL;
9066 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009067
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009069 PyErr_BadInternalCall();
9070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071 }
9072 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009073 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075 fmt = PyUnicode_AS_UNICODE(uformat);
9076 fmtcnt = PyUnicode_GET_SIZE(uformat);
9077
9078 reslen = rescnt = fmtcnt + 100;
9079 result = _PyUnicode_New(reslen);
9080 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082 res = PyUnicode_AS_UNICODE(result);
9083
9084 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 arglen = PyTuple_Size(args);
9086 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087 }
9088 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 arglen = -1;
9090 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009092 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009093 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095
9096 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 if (*fmt != '%') {
9098 if (--rescnt < 0) {
9099 rescnt = fmtcnt + 100;
9100 reslen += rescnt;
9101 if (_PyUnicode_Resize(&result, reslen) < 0)
9102 goto onError;
9103 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9104 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009105 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009107 }
9108 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009109 /* Got a format specifier */
9110 int flags = 0;
9111 Py_ssize_t width = -1;
9112 int prec = -1;
9113 Py_UNICODE c = '\0';
9114 Py_UNICODE fill;
9115 int isnumok;
9116 PyObject *v = NULL;
9117 PyObject *temp = NULL;
9118 Py_UNICODE *pbuf;
9119 Py_UNICODE sign;
9120 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009121 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 fmt++;
9124 if (*fmt == '(') {
9125 Py_UNICODE *keystart;
9126 Py_ssize_t keylen;
9127 PyObject *key;
9128 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009129
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 if (dict == NULL) {
9131 PyErr_SetString(PyExc_TypeError,
9132 "format requires a mapping");
9133 goto onError;
9134 }
9135 ++fmt;
9136 --fmtcnt;
9137 keystart = fmt;
9138 /* Skip over balanced parentheses */
9139 while (pcount > 0 && --fmtcnt >= 0) {
9140 if (*fmt == ')')
9141 --pcount;
9142 else if (*fmt == '(')
9143 ++pcount;
9144 fmt++;
9145 }
9146 keylen = fmt - keystart - 1;
9147 if (fmtcnt < 0 || pcount > 0) {
9148 PyErr_SetString(PyExc_ValueError,
9149 "incomplete format key");
9150 goto onError;
9151 }
9152#if 0
9153 /* keys are converted to strings using UTF-8 and
9154 then looked up since Python uses strings to hold
9155 variables names etc. in its namespaces and we
9156 wouldn't want to break common idioms. */
9157 key = PyUnicode_EncodeUTF8(keystart,
9158 keylen,
9159 NULL);
9160#else
9161 key = PyUnicode_FromUnicode(keystart, keylen);
9162#endif
9163 if (key == NULL)
9164 goto onError;
9165 if (args_owned) {
9166 Py_DECREF(args);
9167 args_owned = 0;
9168 }
9169 args = PyObject_GetItem(dict, key);
9170 Py_DECREF(key);
9171 if (args == NULL) {
9172 goto onError;
9173 }
9174 args_owned = 1;
9175 arglen = -1;
9176 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009177 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009178 while (--fmtcnt >= 0) {
9179 switch (c = *fmt++) {
9180 case '-': flags |= F_LJUST; continue;
9181 case '+': flags |= F_SIGN; continue;
9182 case ' ': flags |= F_BLANK; continue;
9183 case '#': flags |= F_ALT; continue;
9184 case '0': flags |= F_ZERO; continue;
9185 }
9186 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009187 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 if (c == '*') {
9189 v = getnextarg(args, arglen, &argidx);
9190 if (v == NULL)
9191 goto onError;
9192 if (!PyLong_Check(v)) {
9193 PyErr_SetString(PyExc_TypeError,
9194 "* wants int");
9195 goto onError;
9196 }
9197 width = PyLong_AsLong(v);
9198 if (width == -1 && PyErr_Occurred())
9199 goto onError;
9200 if (width < 0) {
9201 flags |= F_LJUST;
9202 width = -width;
9203 }
9204 if (--fmtcnt >= 0)
9205 c = *fmt++;
9206 }
9207 else if (c >= '0' && c <= '9') {
9208 width = c - '0';
9209 while (--fmtcnt >= 0) {
9210 c = *fmt++;
9211 if (c < '0' || c > '9')
9212 break;
9213 if ((width*10) / 10 != width) {
9214 PyErr_SetString(PyExc_ValueError,
9215 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009216 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 }
9218 width = width*10 + (c - '0');
9219 }
9220 }
9221 if (c == '.') {
9222 prec = 0;
9223 if (--fmtcnt >= 0)
9224 c = *fmt++;
9225 if (c == '*') {
9226 v = getnextarg(args, arglen, &argidx);
9227 if (v == NULL)
9228 goto onError;
9229 if (!PyLong_Check(v)) {
9230 PyErr_SetString(PyExc_TypeError,
9231 "* wants int");
9232 goto onError;
9233 }
9234 prec = PyLong_AsLong(v);
9235 if (prec == -1 && PyErr_Occurred())
9236 goto onError;
9237 if (prec < 0)
9238 prec = 0;
9239 if (--fmtcnt >= 0)
9240 c = *fmt++;
9241 }
9242 else if (c >= '0' && c <= '9') {
9243 prec = c - '0';
9244 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009245 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 if (c < '0' || c > '9')
9247 break;
9248 if ((prec*10) / 10 != prec) {
9249 PyErr_SetString(PyExc_ValueError,
9250 "prec too big");
9251 goto onError;
9252 }
9253 prec = prec*10 + (c - '0');
9254 }
9255 }
9256 } /* prec */
9257 if (fmtcnt >= 0) {
9258 if (c == 'h' || c == 'l' || c == 'L') {
9259 if (--fmtcnt >= 0)
9260 c = *fmt++;
9261 }
9262 }
9263 if (fmtcnt < 0) {
9264 PyErr_SetString(PyExc_ValueError,
9265 "incomplete format");
9266 goto onError;
9267 }
9268 if (c != '%') {
9269 v = getnextarg(args, arglen, &argidx);
9270 if (v == NULL)
9271 goto onError;
9272 }
9273 sign = 0;
9274 fill = ' ';
9275 switch (c) {
9276
9277 case '%':
9278 pbuf = formatbuf;
9279 /* presume that buffer length is at least 1 */
9280 pbuf[0] = '%';
9281 len = 1;
9282 break;
9283
9284 case 's':
9285 case 'r':
9286 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009287 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009288 temp = v;
9289 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009290 }
9291 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009292 if (c == 's')
9293 temp = PyObject_Str(v);
9294 else if (c == 'r')
9295 temp = PyObject_Repr(v);
9296 else
9297 temp = PyObject_ASCII(v);
9298 if (temp == NULL)
9299 goto onError;
9300 if (PyUnicode_Check(temp))
9301 /* nothing to do */;
9302 else {
9303 Py_DECREF(temp);
9304 PyErr_SetString(PyExc_TypeError,
9305 "%s argument has non-string str()");
9306 goto onError;
9307 }
9308 }
9309 pbuf = PyUnicode_AS_UNICODE(temp);
9310 len = PyUnicode_GET_SIZE(temp);
9311 if (prec >= 0 && len > prec)
9312 len = prec;
9313 break;
9314
9315 case 'i':
9316 case 'd':
9317 case 'u':
9318 case 'o':
9319 case 'x':
9320 case 'X':
9321 if (c == 'i')
9322 c = 'd';
9323 isnumok = 0;
9324 if (PyNumber_Check(v)) {
9325 PyObject *iobj=NULL;
9326
9327 if (PyLong_Check(v)) {
9328 iobj = v;
9329 Py_INCREF(iobj);
9330 }
9331 else {
9332 iobj = PyNumber_Long(v);
9333 }
9334 if (iobj!=NULL) {
9335 if (PyLong_Check(iobj)) {
9336 isnumok = 1;
9337 temp = formatlong(iobj, flags, prec, c);
9338 Py_DECREF(iobj);
9339 if (!temp)
9340 goto onError;
9341 pbuf = PyUnicode_AS_UNICODE(temp);
9342 len = PyUnicode_GET_SIZE(temp);
9343 sign = 1;
9344 }
9345 else {
9346 Py_DECREF(iobj);
9347 }
9348 }
9349 }
9350 if (!isnumok) {
9351 PyErr_Format(PyExc_TypeError,
9352 "%%%c format: a number is required, "
9353 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9354 goto onError;
9355 }
9356 if (flags & F_ZERO)
9357 fill = '0';
9358 break;
9359
9360 case 'e':
9361 case 'E':
9362 case 'f':
9363 case 'F':
9364 case 'g':
9365 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009366 temp = formatfloat(v, flags, prec, c);
9367 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009369 pbuf = PyUnicode_AS_UNICODE(temp);
9370 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009371 sign = 1;
9372 if (flags & F_ZERO)
9373 fill = '0';
9374 break;
9375
9376 case 'c':
9377 pbuf = formatbuf;
9378 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9379 if (len < 0)
9380 goto onError;
9381 break;
9382
9383 default:
9384 PyErr_Format(PyExc_ValueError,
9385 "unsupported format character '%c' (0x%x) "
9386 "at index %zd",
9387 (31<=c && c<=126) ? (char)c : '?',
9388 (int)c,
9389 (Py_ssize_t)(fmt - 1 -
9390 PyUnicode_AS_UNICODE(uformat)));
9391 goto onError;
9392 }
9393 if (sign) {
9394 if (*pbuf == '-' || *pbuf == '+') {
9395 sign = *pbuf++;
9396 len--;
9397 }
9398 else if (flags & F_SIGN)
9399 sign = '+';
9400 else if (flags & F_BLANK)
9401 sign = ' ';
9402 else
9403 sign = 0;
9404 }
9405 if (width < len)
9406 width = len;
9407 if (rescnt - (sign != 0) < width) {
9408 reslen -= rescnt;
9409 rescnt = width + fmtcnt + 100;
9410 reslen += rescnt;
9411 if (reslen < 0) {
9412 Py_XDECREF(temp);
9413 PyErr_NoMemory();
9414 goto onError;
9415 }
9416 if (_PyUnicode_Resize(&result, reslen) < 0) {
9417 Py_XDECREF(temp);
9418 goto onError;
9419 }
9420 res = PyUnicode_AS_UNICODE(result)
9421 + reslen - rescnt;
9422 }
9423 if (sign) {
9424 if (fill != ' ')
9425 *res++ = sign;
9426 rescnt--;
9427 if (width > len)
9428 width--;
9429 }
9430 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9431 assert(pbuf[0] == '0');
9432 assert(pbuf[1] == c);
9433 if (fill != ' ') {
9434 *res++ = *pbuf++;
9435 *res++ = *pbuf++;
9436 }
9437 rescnt -= 2;
9438 width -= 2;
9439 if (width < 0)
9440 width = 0;
9441 len -= 2;
9442 }
9443 if (width > len && !(flags & F_LJUST)) {
9444 do {
9445 --rescnt;
9446 *res++ = fill;
9447 } while (--width > len);
9448 }
9449 if (fill == ' ') {
9450 if (sign)
9451 *res++ = sign;
9452 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9453 assert(pbuf[0] == '0');
9454 assert(pbuf[1] == c);
9455 *res++ = *pbuf++;
9456 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009457 }
9458 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009459 Py_UNICODE_COPY(res, pbuf, len);
9460 res += len;
9461 rescnt -= len;
9462 while (--width >= len) {
9463 --rescnt;
9464 *res++ = ' ';
9465 }
9466 if (dict && (argidx < arglen) && c != '%') {
9467 PyErr_SetString(PyExc_TypeError,
9468 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009469 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009470 goto onError;
9471 }
9472 Py_XDECREF(temp);
9473 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474 } /* until end */
9475 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009476 PyErr_SetString(PyExc_TypeError,
9477 "not all arguments converted during string formatting");
9478 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479 }
9480
Thomas Woutersa96affe2006-03-12 00:29:36 +00009481 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 }
9486 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487 return (PyObject *)result;
9488
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 Py_XDECREF(result);
9491 Py_DECREF(uformat);
9492 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494 }
9495 return NULL;
9496}
9497
Jeremy Hylton938ace62002-07-17 16:30:39 +00009498static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009499unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9500
Tim Peters6d6c1a32001-08-02 04:15:00 +00009501static PyObject *
9502unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9503{
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009505 static char *kwlist[] = {"object", "encoding", "errors", 0};
9506 char *encoding = NULL;
9507 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009508
Benjamin Peterson14339b62009-01-31 16:36:08 +00009509 if (type != &PyUnicode_Type)
9510 return unicode_subtype_new(type, args, kwds);
9511 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009513 return NULL;
9514 if (x == NULL)
9515 return (PyObject *)_PyUnicode_New(0);
9516 if (encoding == NULL && errors == NULL)
9517 return PyObject_Str(x);
9518 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009520}
9521
Guido van Rossume023fe02001-08-30 03:12:59 +00009522static PyObject *
9523unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9524{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009525 PyUnicodeObject *tmp, *pnew;
9526 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009527
Benjamin Peterson14339b62009-01-31 16:36:08 +00009528 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9529 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9530 if (tmp == NULL)
9531 return NULL;
9532 assert(PyUnicode_Check(tmp));
9533 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9534 if (pnew == NULL) {
9535 Py_DECREF(tmp);
9536 return NULL;
9537 }
9538 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9539 if (pnew->str == NULL) {
9540 _Py_ForgetReference((PyObject *)pnew);
9541 PyObject_Del(pnew);
9542 Py_DECREF(tmp);
9543 return PyErr_NoMemory();
9544 }
9545 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9546 pnew->length = n;
9547 pnew->hash = tmp->hash;
9548 Py_DECREF(tmp);
9549 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009550}
9551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009552PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009553 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009554\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009555Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009556encoding defaults to the current default string encoding.\n\
9557errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009558
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009559static PyObject *unicode_iter(PyObject *seq);
9560
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009562 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009563 "str", /* tp_name */
9564 sizeof(PyUnicodeObject), /* tp_size */
9565 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009567 (destructor)unicode_dealloc, /* tp_dealloc */
9568 0, /* tp_print */
9569 0, /* tp_getattr */
9570 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009571 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009572 unicode_repr, /* tp_repr */
9573 &unicode_as_number, /* tp_as_number */
9574 &unicode_as_sequence, /* tp_as_sequence */
9575 &unicode_as_mapping, /* tp_as_mapping */
9576 (hashfunc) unicode_hash, /* tp_hash*/
9577 0, /* tp_call*/
9578 (reprfunc) unicode_str, /* tp_str */
9579 PyObject_GenericGetAttr, /* tp_getattro */
9580 0, /* tp_setattro */
9581 0, /* tp_as_buffer */
9582 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009583 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009584 unicode_doc, /* tp_doc */
9585 0, /* tp_traverse */
9586 0, /* tp_clear */
9587 PyUnicode_RichCompare, /* tp_richcompare */
9588 0, /* tp_weaklistoffset */
9589 unicode_iter, /* tp_iter */
9590 0, /* tp_iternext */
9591 unicode_methods, /* tp_methods */
9592 0, /* tp_members */
9593 0, /* tp_getset */
9594 &PyBaseObject_Type, /* tp_base */
9595 0, /* tp_dict */
9596 0, /* tp_descr_get */
9597 0, /* tp_descr_set */
9598 0, /* tp_dictoffset */
9599 0, /* tp_init */
9600 0, /* tp_alloc */
9601 unicode_new, /* tp_new */
9602 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603};
9604
9605/* Initialize the Unicode implementation */
9606
Thomas Wouters78890102000-07-22 19:25:51 +00009607void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009609 int i;
9610
Thomas Wouters477c8d52006-05-27 19:21:47 +00009611 /* XXX - move this array to unicodectype.c ? */
9612 Py_UNICODE linebreak[] = {
9613 0x000A, /* LINE FEED */
9614 0x000D, /* CARRIAGE RETURN */
9615 0x001C, /* FILE SEPARATOR */
9616 0x001D, /* GROUP SEPARATOR */
9617 0x001E, /* RECORD SEPARATOR */
9618 0x0085, /* NEXT LINE */
9619 0x2028, /* LINE SEPARATOR */
9620 0x2029, /* PARAGRAPH SEPARATOR */
9621 };
9622
Fred Drakee4315f52000-05-09 19:53:39 +00009623 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009624 free_list = NULL;
9625 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009627 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009629
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009630 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009631 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009632 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009633 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009634
9635 /* initialize the linebreak bloom filter */
9636 bloom_linebreak = make_bloom_mask(
9637 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9638 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009639
9640 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641}
9642
9643/* Finalize the Unicode implementation */
9644
Christian Heimesa156e092008-02-16 07:38:31 +00009645int
9646PyUnicode_ClearFreeList(void)
9647{
9648 int freelist_size = numfree;
9649 PyUnicodeObject *u;
9650
9651 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 PyUnicodeObject *v = u;
9653 u = *(PyUnicodeObject **)u;
9654 if (v->str)
9655 PyObject_DEL(v->str);
9656 Py_XDECREF(v->defenc);
9657 PyObject_Del(v);
9658 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009659 }
9660 free_list = NULL;
9661 assert(numfree == 0);
9662 return freelist_size;
9663}
9664
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665void
Thomas Wouters78890102000-07-22 19:25:51 +00009666_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009668 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009670 Py_XDECREF(unicode_empty);
9671 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009672
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009673 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009674 if (unicode_latin1[i]) {
9675 Py_DECREF(unicode_latin1[i]);
9676 unicode_latin1[i] = NULL;
9677 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009678 }
Christian Heimesa156e092008-02-16 07:38:31 +00009679 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009681
Walter Dörwald16807132007-05-25 13:52:07 +00009682void
9683PyUnicode_InternInPlace(PyObject **p)
9684{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009685 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9686 PyObject *t;
9687 if (s == NULL || !PyUnicode_Check(s))
9688 Py_FatalError(
9689 "PyUnicode_InternInPlace: unicode strings only please!");
9690 /* If it's a subclass, we don't really know what putting
9691 it in the interned dict might do. */
9692 if (!PyUnicode_CheckExact(s))
9693 return;
9694 if (PyUnicode_CHECK_INTERNED(s))
9695 return;
9696 if (interned == NULL) {
9697 interned = PyDict_New();
9698 if (interned == NULL) {
9699 PyErr_Clear(); /* Don't leave an exception */
9700 return;
9701 }
9702 }
9703 /* It might be that the GetItem call fails even
9704 though the key is present in the dictionary,
9705 namely when this happens during a stack overflow. */
9706 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009707 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009708 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009709
Benjamin Peterson29060642009-01-31 22:14:21 +00009710 if (t) {
9711 Py_INCREF(t);
9712 Py_DECREF(*p);
9713 *p = t;
9714 return;
9715 }
Walter Dörwald16807132007-05-25 13:52:07 +00009716
Benjamin Peterson14339b62009-01-31 16:36:08 +00009717 PyThreadState_GET()->recursion_critical = 1;
9718 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9719 PyErr_Clear();
9720 PyThreadState_GET()->recursion_critical = 0;
9721 return;
9722 }
9723 PyThreadState_GET()->recursion_critical = 0;
9724 /* The two references in interned are not counted by refcnt.
9725 The deallocator will take care of this */
9726 Py_REFCNT(s) -= 2;
9727 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009728}
9729
9730void
9731PyUnicode_InternImmortal(PyObject **p)
9732{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009733 PyUnicode_InternInPlace(p);
9734 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9735 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9736 Py_INCREF(*p);
9737 }
Walter Dörwald16807132007-05-25 13:52:07 +00009738}
9739
9740PyObject *
9741PyUnicode_InternFromString(const char *cp)
9742{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009743 PyObject *s = PyUnicode_FromString(cp);
9744 if (s == NULL)
9745 return NULL;
9746 PyUnicode_InternInPlace(&s);
9747 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009748}
9749
9750void _Py_ReleaseInternedUnicodeStrings(void)
9751{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009752 PyObject *keys;
9753 PyUnicodeObject *s;
9754 Py_ssize_t i, n;
9755 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009756
Benjamin Peterson14339b62009-01-31 16:36:08 +00009757 if (interned == NULL || !PyDict_Check(interned))
9758 return;
9759 keys = PyDict_Keys(interned);
9760 if (keys == NULL || !PyList_Check(keys)) {
9761 PyErr_Clear();
9762 return;
9763 }
Walter Dörwald16807132007-05-25 13:52:07 +00009764
Benjamin Peterson14339b62009-01-31 16:36:08 +00009765 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9766 detector, interned unicode strings are not forcibly deallocated;
9767 rather, we give them their stolen references back, and then clear
9768 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009769
Benjamin Peterson14339b62009-01-31 16:36:08 +00009770 n = PyList_GET_SIZE(keys);
9771 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009772 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009773 for (i = 0; i < n; i++) {
9774 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9775 switch (s->state) {
9776 case SSTATE_NOT_INTERNED:
9777 /* XXX Shouldn't happen */
9778 break;
9779 case SSTATE_INTERNED_IMMORTAL:
9780 Py_REFCNT(s) += 1;
9781 immortal_size += s->length;
9782 break;
9783 case SSTATE_INTERNED_MORTAL:
9784 Py_REFCNT(s) += 2;
9785 mortal_size += s->length;
9786 break;
9787 default:
9788 Py_FatalError("Inconsistent interned string state.");
9789 }
9790 s->state = SSTATE_NOT_INTERNED;
9791 }
9792 fprintf(stderr, "total size of all interned strings: "
9793 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9794 "mortal/immortal\n", mortal_size, immortal_size);
9795 Py_DECREF(keys);
9796 PyDict_Clear(interned);
9797 Py_DECREF(interned);
9798 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009799}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009800
9801
9802/********************* Unicode Iterator **************************/
9803
9804typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009805 PyObject_HEAD
9806 Py_ssize_t it_index;
9807 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009808} unicodeiterobject;
9809
9810static void
9811unicodeiter_dealloc(unicodeiterobject *it)
9812{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009813 _PyObject_GC_UNTRACK(it);
9814 Py_XDECREF(it->it_seq);
9815 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009816}
9817
9818static int
9819unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9820{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009821 Py_VISIT(it->it_seq);
9822 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009823}
9824
9825static PyObject *
9826unicodeiter_next(unicodeiterobject *it)
9827{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009828 PyUnicodeObject *seq;
9829 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009830
Benjamin Peterson14339b62009-01-31 16:36:08 +00009831 assert(it != NULL);
9832 seq = it->it_seq;
9833 if (seq == NULL)
9834 return NULL;
9835 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009836
Benjamin Peterson14339b62009-01-31 16:36:08 +00009837 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9838 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009839 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009840 if (item != NULL)
9841 ++it->it_index;
9842 return item;
9843 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009844
Benjamin Peterson14339b62009-01-31 16:36:08 +00009845 Py_DECREF(seq);
9846 it->it_seq = NULL;
9847 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009848}
9849
9850static PyObject *
9851unicodeiter_len(unicodeiterobject *it)
9852{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009853 Py_ssize_t len = 0;
9854 if (it->it_seq)
9855 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9856 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009857}
9858
9859PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9860
9861static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009862 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009863 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009864 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009865};
9866
9867PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009868 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9869 "str_iterator", /* tp_name */
9870 sizeof(unicodeiterobject), /* tp_basicsize */
9871 0, /* tp_itemsize */
9872 /* methods */
9873 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9874 0, /* tp_print */
9875 0, /* tp_getattr */
9876 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009877 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009878 0, /* tp_repr */
9879 0, /* tp_as_number */
9880 0, /* tp_as_sequence */
9881 0, /* tp_as_mapping */
9882 0, /* tp_hash */
9883 0, /* tp_call */
9884 0, /* tp_str */
9885 PyObject_GenericGetAttr, /* tp_getattro */
9886 0, /* tp_setattro */
9887 0, /* tp_as_buffer */
9888 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9889 0, /* tp_doc */
9890 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9891 0, /* tp_clear */
9892 0, /* tp_richcompare */
9893 0, /* tp_weaklistoffset */
9894 PyObject_SelfIter, /* tp_iter */
9895 (iternextfunc)unicodeiter_next, /* tp_iternext */
9896 unicodeiter_methods, /* tp_methods */
9897 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009898};
9899
9900static PyObject *
9901unicode_iter(PyObject *seq)
9902{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009903 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009904
Benjamin Peterson14339b62009-01-31 16:36:08 +00009905 if (!PyUnicode_Check(seq)) {
9906 PyErr_BadInternalCall();
9907 return NULL;
9908 }
9909 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9910 if (it == NULL)
9911 return NULL;
9912 it->it_index = 0;
9913 Py_INCREF(seq);
9914 it->it_seq = (PyUnicodeObject *)seq;
9915 _PyObject_GC_TRACK(it);
9916 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009917}
9918
Martin v. Löwis5b222132007-06-10 09:51:05 +00009919size_t
9920Py_UNICODE_strlen(const Py_UNICODE *u)
9921{
9922 int res = 0;
9923 while(*u++)
9924 res++;
9925 return res;
9926}
9927
9928Py_UNICODE*
9929Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9930{
9931 Py_UNICODE *u = s1;
9932 while ((*u++ = *s2++));
9933 return s1;
9934}
9935
9936Py_UNICODE*
9937Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9938{
9939 Py_UNICODE *u = s1;
9940 while ((*u++ = *s2++))
9941 if (n-- == 0)
9942 break;
9943 return s1;
9944}
9945
9946int
9947Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9948{
9949 while (*s1 && *s2 && *s1 == *s2)
9950 s1++, s2++;
9951 if (*s1 && *s2)
9952 return (*s1 < *s2) ? -1 : +1;
9953 if (*s1)
9954 return 1;
9955 if (*s2)
9956 return -1;
9957 return 0;
9958}
9959
9960Py_UNICODE*
9961Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9962{
9963 const Py_UNICODE *p;
9964 for (p = s; *p; p++)
9965 if (*p == c)
9966 return (Py_UNICODE*)p;
9967 return NULL;
9968}
9969
9970
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009971#ifdef __cplusplus
9972}
9973#endif