blob: 4153c25f53d657a123b41ccc10dec4e035519b30 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table covering code points 0x00-0x7F: an entry is 1 when the
   code point is flagged as whitespace and 0 otherwise.  Each row below
   holds eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Same for linebreaks.

   Lookup table covering code points 0x00-0x7F: an entry is 1 when the
   code point is flagged as a line break and 0 otherwise.  Each row below
   holds eight consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   bits (ch & (BLOOM_WIDTH - 1)) of each Unicode character as the bit index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
/* BLOOM_ADD(mask, ch): set the bit selected by the low bits of ch
   (ch & (BLOOM_WIDTH - 1)) in the lvalue mask.
   BLOOM(mask, ch): nonzero when that bit is set in mask, 0 otherwise.
   Every argument is parenthesized so that compound expressions such as
   BLOOM(a | b, ch) are grouped as the caller wrote them. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* True when chr is a member of set[0..setlen-1].  The bloom mask is
   consulted first so that unicode_member() only runs when the mask says
   the character might be present.  The whole expansion is parenthesized
   so the macro behaves as a single operand (e.g. under '!' or '||'). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
725
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000726/* size of fixed-size buffer for formatting single arguments */
727#define ITEM_BUFFER_LEN 21
728/* maximum number of characters required for output of %ld. 21 characters
729 allows for 64-bit integers (in decimal) and an optional sign. */
730#define MAX_LONG_CHARS 21
731/* maximum number of characters required for output of %lld.
732 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
733 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
734#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 "coercing to str: need string or buffer, "
1277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
/* Write a normalized copy of encoding into lower: upper-case letters
   are lowered and every '_' becomes '-', so that e.g. UTF_8 compares
   equal to utf-8.  lower_len is the capacity of lower, including the
   terminating NUL.
   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1 characters); in the latter case lower is left without a
   terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    /* last slot is reserved for the NUL terminator */
    const size_t capacity = lower_len - 1;
    size_t used = 0;
    const char *src;

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (used == capacity)
            return 0;

        if (ISUPPER(*src))
            ch = TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;

        lower[used++] = ch;
    }
    lower[used] = '\0';
    return 1;
}
1328
1329PyObject *PyUnicode_Decode(const char *s,
1330 Py_ssize_t size,
1331 const char *encoding,
1332 const char *errors)
1333{
1334 PyObject *buffer = NULL, *unicode;
1335 Py_buffer info;
1336 char lower[11]; /* Enough for any encoding shortcut */
1337
1338 if (encoding == NULL)
1339 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001340
1341 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001342 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1343 if (strcmp(lower, "utf-8") == 0)
1344 return PyUnicode_DecodeUTF8(s, size, errors);
1345 else if ((strcmp(lower, "latin-1") == 0) ||
1346 (strcmp(lower, "iso-8859-1") == 0))
1347 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001348#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001349 else if (strcmp(lower, "mbcs") == 0)
1350 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001351#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001352 else if (strcmp(lower, "ascii") == 0)
1353 return PyUnicode_DecodeASCII(s, size, errors);
1354 else if (strcmp(lower, "utf-16") == 0)
1355 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1356 else if (strcmp(lower, "utf-32") == 0)
1357 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359
1360 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001361 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001362 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001363 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001364 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 if (buffer == NULL)
1366 goto onError;
1367 unicode = PyCodec_Decode(buffer, encoding, errors);
1368 if (unicode == NULL)
1369 goto onError;
1370 if (!PyUnicode_Check(unicode)) {
1371 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001372 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001373 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001374 Py_DECREF(unicode);
1375 goto onError;
1376 }
1377 Py_DECREF(buffer);
1378 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001379
Benjamin Peterson29060642009-01-31 22:14:21 +00001380 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 Py_XDECREF(buffer);
1382 return NULL;
1383}
1384
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001385PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1386 const char *encoding,
1387 const char *errors)
1388{
1389 PyObject *v;
1390
1391 if (!PyUnicode_Check(unicode)) {
1392 PyErr_BadArgument();
1393 goto onError;
1394 }
1395
1396 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001398
1399 /* Decode via the codec registry */
1400 v = PyCodec_Decode(unicode, encoding, errors);
1401 if (v == NULL)
1402 goto onError;
1403 return v;
1404
Benjamin Peterson29060642009-01-31 22:14:21 +00001405 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001406 return NULL;
1407}
1408
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001409PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1410 const char *encoding,
1411 const char *errors)
1412{
1413 PyObject *v;
1414
1415 if (!PyUnicode_Check(unicode)) {
1416 PyErr_BadArgument();
1417 goto onError;
1418 }
1419
1420 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001422
1423 /* Decode via the codec registry */
1424 v = PyCodec_Decode(unicode, encoding, errors);
1425 if (v == NULL)
1426 goto onError;
1427 if (!PyUnicode_Check(v)) {
1428 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001429 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001430 Py_TYPE(v)->tp_name);
1431 Py_DECREF(v);
1432 goto onError;
1433 }
1434 return v;
1435
Benjamin Peterson29060642009-01-31 22:14:21 +00001436 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001437 return NULL;
1438}
1439
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001441 Py_ssize_t size,
1442 const char *encoding,
1443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444{
1445 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001446
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 unicode = PyUnicode_FromUnicode(s, size);
1448 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1451 Py_DECREF(unicode);
1452 return v;
1453}
1454
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001455PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1456 const char *encoding,
1457 const char *errors)
1458{
1459 PyObject *v;
1460
1461 if (!PyUnicode_Check(unicode)) {
1462 PyErr_BadArgument();
1463 goto onError;
1464 }
1465
1466 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001467 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001468
1469 /* Encode via the codec registry */
1470 v = PyCodec_Encode(unicode, encoding, errors);
1471 if (v == NULL)
1472 goto onError;
1473 return v;
1474
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001476 return NULL;
1477}
1478
Victor Stinnerae6265f2010-05-15 16:27:27 +00001479PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1480{
Victor Stinner313a1202010-06-11 23:56:51 +00001481 if (Py_FileSystemDefaultEncoding) {
1482#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1483 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1484 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1485 PyUnicode_GET_SIZE(unicode),
1486 NULL);
1487#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001488 return PyUnicode_AsEncodedString(unicode,
1489 Py_FileSystemDefaultEncoding,
1490 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001491 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001492 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1493 PyUnicode_GET_SIZE(unicode),
1494 "surrogateescape");
1495}
1496
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1498 const char *encoding,
1499 const char *errors)
1500{
1501 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001502 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001503
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 if (!PyUnicode_Check(unicode)) {
1505 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 }
Fred Drakee4315f52000-05-09 19:53:39 +00001508
Tim Petersced69f82003-09-16 20:30:58 +00001509 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001511
1512 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001513 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1514 if (strcmp(lower, "utf-8") == 0)
1515 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 errors);
1518 else if ((strcmp(lower, "latin-1") == 0) ||
1519 (strcmp(lower, "iso-8859-1") == 0))
1520 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1521 PyUnicode_GET_SIZE(unicode),
1522 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001523#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001524 else if (strcmp(lower, "mbcs") == 0)
1525 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1526 PyUnicode_GET_SIZE(unicode),
1527 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001528#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001529 else if (strcmp(lower, "ascii") == 0)
1530 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1531 PyUnicode_GET_SIZE(unicode),
1532 errors);
1533 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001534 /* During bootstrap, we may need to find the encodings
1535 package, to load the file system encoding, and require the
1536 file system encoding in order to load the encodings
1537 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001538
Victor Stinner59e62db2010-05-15 13:14:32 +00001539 Break out of this dependency by assuming that the path to
1540 the encodings module is ASCII-only. XXX could try wcstombs
1541 instead, if the file system encoding is the locale's
1542 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001543 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001544 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1545 !PyThreadState_GET()->interp->codecs_initialized)
1546 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1547 PyUnicode_GET_SIZE(unicode),
1548 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549
1550 /* Encode via the codec registry */
1551 v = PyCodec_Encode(unicode, encoding, errors);
1552 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001553 return NULL;
1554
1555 /* The normal path */
1556 if (PyBytes_Check(v))
1557 return v;
1558
1559 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560 if (PyByteArray_Check(v)) {
1561 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001562 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001563 PyOS_snprintf(msg, sizeof(msg),
1564 "encoder %s returned buffer instead of bytes",
1565 encoding);
1566 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001567 Py_DECREF(v);
1568 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001569 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001570
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001571 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1572 Py_DECREF(v);
1573 return b;
1574 }
1575
1576 PyErr_Format(PyExc_TypeError,
1577 "encoder did not return a bytes object (type=%.400s)",
1578 Py_TYPE(v)->tp_name);
1579 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001580 return NULL;
1581}
1582
1583PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1584 const char *encoding,
1585 const char *errors)
1586{
1587 PyObject *v;
1588
1589 if (!PyUnicode_Check(unicode)) {
1590 PyErr_BadArgument();
1591 goto onError;
1592 }
1593
1594 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001595 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001596
1597 /* Encode via the codec registry */
1598 v = PyCodec_Encode(unicode, encoding, errors);
1599 if (v == NULL)
1600 goto onError;
1601 if (!PyUnicode_Check(v)) {
1602 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001603 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001604 Py_TYPE(v)->tp_name);
1605 Py_DECREF(v);
1606 goto onError;
1607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001609
Benjamin Peterson29060642009-01-31 22:14:21 +00001610 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 return NULL;
1612}
1613
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001614PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001615 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001616{
1617 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001618 if (v)
1619 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001620 if (errors != NULL)
1621 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001622 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001623 PyUnicode_GET_SIZE(unicode),
1624 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001625 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001626 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001627 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001628 return v;
1629}
1630
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001631PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001632PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001633 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001634 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1635}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001636
Christian Heimes5894ba72007-11-04 11:43:14 +00001637PyObject*
1638PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1639{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001640 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1641 can be undefined. If it is case, decode using UTF-8. The following assumes
1642 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1643 bootstrapping process where the codecs aren't ready yet.
1644 */
1645 if (Py_FileSystemDefaultEncoding) {
1646#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001647 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001648 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001649 }
1650#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001651 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001652 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001653 }
1654#endif
1655 return PyUnicode_Decode(s, size,
1656 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001657 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001658 }
1659 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001660 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001661 }
1662}
1663
Martin v. Löwis011e8422009-05-05 04:43:17 +00001664/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001665 system encoding. The addr param must be a PyObject**.
1666 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001667
1668int
1669PyUnicode_FSConverter(PyObject* arg, void* addr)
1670{
1671 PyObject *output = NULL;
1672 Py_ssize_t size;
1673 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001674 if (arg == NULL) {
1675 Py_DECREF(*(PyObject**)addr);
1676 return 1;
1677 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001678 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001679 output = arg;
1680 Py_INCREF(output);
1681 }
1682 else {
1683 arg = PyUnicode_FromObject(arg);
1684 if (!arg)
1685 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001686 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001687 Py_DECREF(arg);
1688 if (!output)
1689 return 0;
1690 if (!PyBytes_Check(output)) {
1691 Py_DECREF(output);
1692 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1693 return 0;
1694 }
1695 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001696 size = PyBytes_GET_SIZE(output);
1697 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001698 if (size != strlen(data)) {
1699 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1700 Py_DECREF(output);
1701 return 0;
1702 }
1703 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001704 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001705}
1706
1707
Martin v. Löwis5b222132007-06-10 09:51:05 +00001708char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001709_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001710{
Christian Heimesf3863112007-11-22 07:46:41 +00001711 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001712 if (!PyUnicode_Check(unicode)) {
1713 PyErr_BadArgument();
1714 return NULL;
1715 }
Christian Heimesf3863112007-11-22 07:46:41 +00001716 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1717 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001718 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001719 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001720 *psize = PyBytes_GET_SIZE(bytes);
1721 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001722}
1723
1724char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001725_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001726{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001727 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001728}
1729
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1731{
1732 if (!PyUnicode_Check(unicode)) {
1733 PyErr_BadArgument();
1734 goto onError;
1735 }
1736 return PyUnicode_AS_UNICODE(unicode);
1737
Benjamin Peterson29060642009-01-31 22:14:21 +00001738 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return NULL;
1740}
1741
Martin v. Löwis18e16552006-02-15 17:27:45 +00001742Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743{
1744 if (!PyUnicode_Check(unicode)) {
1745 PyErr_BadArgument();
1746 goto onError;
1747 }
1748 return PyUnicode_GET_SIZE(unicode);
1749
Benjamin Peterson29060642009-01-31 22:14:21 +00001750 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return -1;
1752}
1753
Thomas Wouters78890102000-07-22 19:25:51 +00001754const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001755{
1756 return unicode_default_encoding;
1757}
1758
1759int PyUnicode_SetDefaultEncoding(const char *encoding)
1760{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001761 if (strcmp(encoding, unicode_default_encoding) != 0) {
1762 PyErr_Format(PyExc_ValueError,
1763 "Can only set default encoding to %s",
1764 unicode_default_encoding);
1765 return -1;
1766 }
Fred Drakee4315f52000-05-09 19:53:39 +00001767 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001768}
1769
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001770/* error handling callback helper:
1771 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001772 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 and adjust various state variables.
1774 return 0 on success, -1 on error
1775*/
1776
1777static
1778int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 const char *encoding, const char *reason,
1780 const char **input, const char **inend, Py_ssize_t *startinpos,
1781 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1782 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001783{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001784 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785
1786 PyObject *restuple = NULL;
1787 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001789 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001790 Py_ssize_t requiredsize;
1791 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001793 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001794 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 int res = -1;
1796
1797 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 *errorHandler = PyCodec_LookupError(errors);
1799 if (*errorHandler == NULL)
1800 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 }
1802
1803 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001805 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1806 if (*exceptionObject == NULL)
1807 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 }
1809 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001810 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1811 goto onError;
1812 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1813 goto onError;
1814 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1815 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 }
1817
1818 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1819 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001820 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001821 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001822 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 }
1825 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001826 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001827
1828 /* Copy back the bytes variables, which might have been modified by the
1829 callback */
1830 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1831 if (!inputobj)
1832 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001833 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001834 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001835 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001836 *input = PyBytes_AS_STRING(inputobj);
1837 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001838 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001839 /* we can DECREF safely, as the exception has another reference,
1840 so the object won't go away. */
1841 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001844 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001845 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1847 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001848 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849
1850 /* need more space? (at least enough for what we
1851 have+the replacement+the rest of the string (starting
1852 at the new input position), so we won't have to check space
1853 when there are no errors in the rest of the string) */
1854 repptr = PyUnicode_AS_UNICODE(repunicode);
1855 repsize = PyUnicode_GET_SIZE(repunicode);
1856 requiredsize = *outpos + repsize + insize-newpos;
1857 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001858 if (requiredsize<2*outsize)
1859 requiredsize = 2*outsize;
1860 if (_PyUnicode_Resize(output, requiredsize) < 0)
1861 goto onError;
1862 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 }
1864 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001865 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001866 Py_UNICODE_COPY(*outptr, repptr, repsize);
1867 *outptr += repsize;
1868 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001870 /* we made it! */
1871 res = 0;
1872
Benjamin Peterson29060642009-01-31 22:14:21 +00001873 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001874 Py_XDECREF(restuple);
1875 return res;
1876}
1877
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001878/* --- UTF-7 Codec -------------------------------------------------------- */
1879
Antoine Pitrou244651a2009-05-04 18:56:13 +00001880/* See RFC2152 for details. We encode conservatively and decode liberally. */
1881
1882/* Three simple macros defining base-64. */
1883
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' or '/').  Evaluates c several times. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))
1891
/* Map a base-64 character to its 6-bit value (A-Z: 0-25, a-z: 26-51,
   0-9: 52-61, '+': 62, anything else: 63).  The caller must already know
   that c is a base-64 character. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('+' == (c)) ? 62 : 63)
1899
/* The base-64 character that encodes the low 6 bits of n; higher bits of
   n are masked off. */

#define TO_BASE64(n)                                                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3F])
1904
/* DECODE_DIRECT: true if a byte met in a UTF-7 string outside a shift
 * sequence decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
1912
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This table, indexed by ASCII code, gives
 * the category of each character:
 *    0 : "Set D"
 *        alphanumeric and '(),-./:?
 *    1 : "Set O"
 *        !"#$%&*;<=>@[]^_`{|}
 *    2 : "whitespace"
 *        ht nl cr sp
 *    3 : special (must be base64 encoded)
 *        everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1946
/* ENCODE_DIRECT: true if character c should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary
 * between applications, so this code needs to be flexible.
 *
 * NUL and non-ASCII characters are never encoded directly.  The flag
 * arguments are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) keep their meaning.  c is evaluated
 * several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001958
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001959PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001960 Py_ssize_t size,
1961 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001963 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1964}
1965
Antoine Pitrou244651a2009-05-04 18:56:13 +00001966/* The decoder. The only state we preserve is our read position,
1967 * i.e. how many characters we have consumed. So if we end in the
1968 * middle of a shift sequence we have to back off the read position
1969 * and the output to the beginning of the sequence, otherwise we lose
1970 * all the shift state (seen bits, number of bits seen, high
1971 * surrogate). */
1972
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001973PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001974 Py_ssize_t size,
1975 const char *errors,
1976 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001977{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001979 Py_ssize_t startinpos;
1980 Py_ssize_t endinpos;
1981 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001982 const char *e;
1983 PyUnicodeObject *unicode;
1984 Py_UNICODE *p;
1985 const char *errmsg = "";
1986 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001987 Py_UNICODE *shiftOutStart;
1988 unsigned int base64bits = 0;
1989 unsigned long base64buffer = 0;
1990 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001991 PyObject *errorHandler = NULL;
1992 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001993
1994 unicode = _PyUnicode_New(size);
1995 if (!unicode)
1996 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001997 if (size == 0) {
1998 if (consumed)
1999 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002000 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002001 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002002
2003 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002004 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002005 e = s + size;
2006
2007 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002008 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002010 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002011
Antoine Pitrou244651a2009-05-04 18:56:13 +00002012 if (inShift) { /* in a base-64 section */
2013 if (IS_BASE64(ch)) { /* consume a base-64 character */
2014 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2015 base64bits += 6;
2016 s++;
2017 if (base64bits >= 16) {
2018 /* we have enough bits for a UTF-16 value */
2019 Py_UNICODE outCh = (Py_UNICODE)
2020 (base64buffer >> (base64bits-16));
2021 base64bits -= 16;
2022 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2023 if (surrogate) {
2024 /* expecting a second surrogate */
2025 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2026#ifdef Py_UNICODE_WIDE
2027 *p++ = (((surrogate & 0x3FF)<<10)
2028 | (outCh & 0x3FF)) + 0x10000;
2029#else
2030 *p++ = surrogate;
2031 *p++ = outCh;
2032#endif
2033 surrogate = 0;
2034 }
2035 else {
2036 surrogate = 0;
2037 errmsg = "second surrogate missing";
2038 goto utf7Error;
2039 }
2040 }
2041 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2042 /* first surrogate */
2043 surrogate = outCh;
2044 }
2045 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2046 errmsg = "unexpected second surrogate";
2047 goto utf7Error;
2048 }
2049 else {
2050 *p++ = outCh;
2051 }
2052 }
2053 }
2054 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002055 inShift = 0;
2056 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002057 if (surrogate) {
2058 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002059 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002060 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002061 if (base64bits > 0) { /* left-over bits */
2062 if (base64bits >= 6) {
2063 /* We've seen at least one base-64 character */
2064 errmsg = "partial character in shift sequence";
2065 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002066 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002067 else {
2068 /* Some bits remain; they should be zero */
2069 if (base64buffer != 0) {
2070 errmsg = "non-zero padding bits in shift sequence";
2071 goto utf7Error;
2072 }
2073 }
2074 }
2075 if (ch != '-') {
2076 /* '-' is absorbed; other terminating
2077 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002078 *p++ = ch;
2079 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002080 }
2081 }
2082 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002083 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002084 s++; /* consume '+' */
2085 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002086 s++;
2087 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002088 }
2089 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002090 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002091 shiftOutStart = p;
2092 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002093 }
2094 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002095 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002096 *p++ = ch;
2097 s++;
2098 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002099 else {
2100 startinpos = s-starts;
2101 s++;
2102 errmsg = "unexpected special character";
2103 goto utf7Error;
2104 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002105 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002106utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 outpos = p-PyUnicode_AS_UNICODE(unicode);
2108 endinpos = s-starts;
2109 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002110 errors, &errorHandler,
2111 "utf7", errmsg,
2112 &starts, &e, &startinpos, &endinpos, &exc, &s,
2113 &unicode, &outpos, &p))
2114 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002115 }
2116
Antoine Pitrou244651a2009-05-04 18:56:13 +00002117 /* end of string */
2118
2119 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2120 /* if we're in an inconsistent state, that's an error */
2121 if (surrogate ||
2122 (base64bits >= 6) ||
2123 (base64bits > 0 && base64buffer != 0)) {
2124 outpos = p-PyUnicode_AS_UNICODE(unicode);
2125 endinpos = size;
2126 if (unicode_decode_call_errorhandler(
2127 errors, &errorHandler,
2128 "utf7", "unterminated shift sequence",
2129 &starts, &e, &startinpos, &endinpos, &exc, &s,
2130 &unicode, &outpos, &p))
2131 goto onError;
2132 if (s < e)
2133 goto restart;
2134 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002136
2137 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002138 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002139 if (inShift) {
2140 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002141 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002142 }
2143 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002144 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002145 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002146 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002147
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002148 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002149 goto onError;
2150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002151 Py_XDECREF(errorHandler);
2152 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002153 return (PyObject *)unicode;
2154
Benjamin Peterson29060642009-01-31 22:14:21 +00002155 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002156 Py_XDECREF(errorHandler);
2157 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002158 Py_DECREF(unicode);
2159 return NULL;
2160}
2161
2162
2163PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002164 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002165 int base64SetO,
2166 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002167 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002168{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002169 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002170 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002171 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002172 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002173 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002174 unsigned int base64bits = 0;
2175 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002176 char * out;
2177 char * start;
2178
2179 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002180 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002181
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002182 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002183 return PyErr_NoMemory();
2184
Antoine Pitrou244651a2009-05-04 18:56:13 +00002185 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002186 if (v == NULL)
2187 return NULL;
2188
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002189 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002190 for (;i < size; ++i) {
2191 Py_UNICODE ch = s[i];
2192
Antoine Pitrou244651a2009-05-04 18:56:13 +00002193 if (inShift) {
2194 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2195 /* shifting out */
2196 if (base64bits) { /* output remaining bits */
2197 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2198 base64buffer = 0;
2199 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002200 }
2201 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002202 /* Characters not in the BASE64 set implicitly unshift the sequence
2203 so no '-' is required, except if the character is itself a '-' */
2204 if (IS_BASE64(ch) || ch == '-') {
2205 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002207 *out++ = (char) ch;
2208 }
2209 else {
2210 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002211 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002212 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002213 else { /* not in a shift sequence */
2214 if (ch == '+') {
2215 *out++ = '+';
2216 *out++ = '-';
2217 }
2218 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2219 *out++ = (char) ch;
2220 }
2221 else {
2222 *out++ = '+';
2223 inShift = 1;
2224 goto encode_char;
2225 }
2226 }
2227 continue;
2228encode_char:
2229#ifdef Py_UNICODE_WIDE
2230 if (ch >= 0x10000) {
2231 /* code first surrogate */
2232 base64bits += 16;
2233 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2234 while (base64bits >= 6) {
2235 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2236 base64bits -= 6;
2237 }
2238 /* prepare second surrogate */
2239 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2240 }
2241#endif
2242 base64bits += 16;
2243 base64buffer = (base64buffer << 16) | ch;
2244 while (base64bits >= 6) {
2245 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2246 base64bits -= 6;
2247 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002248 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002249 if (base64bits)
2250 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2251 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002252 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002253 if (_PyBytes_Resize(&v, out - start) < 0)
2254 return NULL;
2255 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002256}
2257
Antoine Pitrou244651a2009-05-04 18:56:13 +00002258#undef IS_BASE64
2259#undef FROM_BASE64
2260#undef TO_BASE64
2261#undef DECODE_DIRECT
2262#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002263
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264/* --- UTF-8 Codec -------------------------------------------------------- */
2265
/* Total length of a UTF-8 sequence, indexed by its first byte.
   An entry of zero means the byte is an illegal prefix, i.e. it cannot
   start a sequence.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: illegal as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE and 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2287
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 Py_ssize_t size,
2290 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291{
Walter Dörwald69652032004-09-07 20:24:22 +00002292 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2293}
2294
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so that it behaves as a single
   operand whatever operator is applied to it. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char: the top bit of every byte is set.
   The UL suffix makes the unsigned type of the constant explicit. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
2307
Walter Dörwald69652032004-09-07 20:24:22 +00002308PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002309 Py_ssize_t size,
2310 const char *errors,
2311 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002312{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002313 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002315 Py_ssize_t startinpos;
2316 Py_ssize_t endinpos;
2317 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002318 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319 PyUnicodeObject *unicode;
2320 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002321 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002322 PyObject *errorHandler = NULL;
2323 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324
2325 /* Note: size will always be longer than the resulting Unicode
2326 character count */
2327 unicode = _PyUnicode_New(size);
2328 if (!unicode)
2329 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002330 if (size == 0) {
2331 if (consumed)
2332 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335
2336 /* Unpack UTF-8 encoded data */
2337 p = unicode->str;
2338 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002339 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340
2341 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002342 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343
2344 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002345 /* Fast path for runs of ASCII characters. Given that common UTF-8
2346 input will consist of an overwhelming majority of ASCII
2347 characters, we try to optimize for this case by checking
2348 as many characters as a C 'long' can contain.
2349 First, check if we can do an aligned read, as most CPUs have
2350 a penalty for unaligned reads.
2351 */
2352 if (!((size_t) s & LONG_PTR_MASK)) {
2353 /* Help register allocation */
2354 register const char *_s = s;
2355 register Py_UNICODE *_p = p;
2356 while (_s < aligned_end) {
2357 /* Read a whole long at a time (either 4 or 8 bytes),
2358 and do a fast unrolled copy if it only contains ASCII
2359 characters. */
2360 unsigned long data = *(unsigned long *) _s;
2361 if (data & ASCII_CHAR_MASK)
2362 break;
2363 _p[0] = (unsigned char) _s[0];
2364 _p[1] = (unsigned char) _s[1];
2365 _p[2] = (unsigned char) _s[2];
2366 _p[3] = (unsigned char) _s[3];
2367#if (SIZEOF_LONG == 8)
2368 _p[4] = (unsigned char) _s[4];
2369 _p[5] = (unsigned char) _s[5];
2370 _p[6] = (unsigned char) _s[6];
2371 _p[7] = (unsigned char) _s[7];
2372#endif
2373 _s += SIZEOF_LONG;
2374 _p += SIZEOF_LONG;
2375 }
2376 s = _s;
2377 p = _p;
2378 if (s == e)
2379 break;
2380 ch = (unsigned char)*s;
2381 }
2382 }
2383
2384 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002385 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386 s++;
2387 continue;
2388 }
2389
2390 n = utf8_code_length[ch];
2391
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002392 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002393 if (consumed)
2394 break;
2395 else {
2396 errmsg = "unexpected end of data";
2397 startinpos = s-starts;
2398 endinpos = size;
2399 goto utf8Error;
2400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402
2403 switch (n) {
2404
2405 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002406 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002407 startinpos = s-starts;
2408 endinpos = startinpos+1;
2409 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410
2411 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002412 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002413 startinpos = s-starts;
2414 endinpos = startinpos+1;
2415 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416
2417 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002418 if ((s[1] & 0xc0) != 0x80) {
2419 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002420 startinpos = s-starts;
2421 endinpos = startinpos+2;
2422 goto utf8Error;
2423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002425 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002426 startinpos = s-starts;
2427 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002428 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002429 goto utf8Error;
2430 }
2431 else
2432 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002433 break;
2434
2435 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002436 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002437 (s[2] & 0xc0) != 0x80) {
2438 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002439 startinpos = s-starts;
2440 endinpos = startinpos+3;
2441 goto utf8Error;
2442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002443 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002444 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002445 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002446 startinpos = s-starts;
2447 endinpos = startinpos+3;
2448 goto utf8Error;
2449 }
2450 else
2451 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002452 break;
2453
2454 case 4:
2455 if ((s[1] & 0xc0) != 0x80 ||
2456 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002457 (s[3] & 0xc0) != 0x80) {
2458 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002459 startinpos = s-starts;
2460 endinpos = startinpos+4;
2461 goto utf8Error;
2462 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002463 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002464 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002465 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002466 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002467 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002468 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002469 UTF-16 */
2470 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002471 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002472 startinpos = s-starts;
2473 endinpos = startinpos+4;
2474 goto utf8Error;
2475 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002476#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002477 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002478#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002479 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002480
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002481 /* translate from 10000..10FFFF to 0..FFFF */
2482 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002483
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002484 /* high surrogate = top 10 bits added to D800 */
2485 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002486
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002487 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002488 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002489#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 break;
2491
2492 default:
2493 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002494 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002495 startinpos = s-starts;
2496 endinpos = startinpos+n;
2497 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 }
2499 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002500 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002501
Benjamin Peterson29060642009-01-31 22:14:21 +00002502 utf8Error:
2503 outpos = p-PyUnicode_AS_UNICODE(unicode);
2504 if (unicode_decode_call_errorhandler(
2505 errors, &errorHandler,
2506 "utf8", errmsg,
2507 &starts, &e, &startinpos, &endinpos, &exc, &s,
2508 &unicode, &outpos, &p))
2509 goto onError;
2510 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 }
Walter Dörwald69652032004-09-07 20:24:22 +00002512 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002513 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514
2515 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002516 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 goto onError;
2518
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002519 Py_XDECREF(errorHandler);
2520 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 return (PyObject *)unicode;
2522
Benjamin Peterson29060642009-01-31 22:14:21 +00002523 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 Py_XDECREF(errorHandler);
2525 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 Py_DECREF(unicode);
2527 return NULL;
2528}
2529
Antoine Pitrouab868312009-01-10 15:40:25 +00002530#undef ASCII_CHAR_MASK
2531
2532
Tim Peters602f7402002-04-27 18:03:26 +00002533/* Allocation strategy: if the string is short, convert into a stack buffer
2534 and allocate exactly as much space needed at the end. Else allocate the
2535 maximum possible needed (4 result bytes per Unicode character), and return
2536 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002537*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002538PyObject *
2539PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002540 Py_ssize_t size,
2541 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542{
Tim Peters602f7402002-04-27 18:03:26 +00002543#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002544
Guido van Rossum98297ee2007-11-06 21:34:58 +00002545 Py_ssize_t i; /* index into s of next input byte */
2546 PyObject *result; /* result string object */
2547 char *p; /* next free byte in output buffer */
2548 Py_ssize_t nallocated; /* number of result bytes allocated */
2549 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002550 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002551 PyObject *errorHandler = NULL;
2552 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002553
Tim Peters602f7402002-04-27 18:03:26 +00002554 assert(s != NULL);
2555 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556
Tim Peters602f7402002-04-27 18:03:26 +00002557 if (size <= MAX_SHORT_UNICHARS) {
2558 /* Write into the stack buffer; nallocated can't overflow.
2559 * At the end, we'll allocate exactly as much heap space as it
2560 * turns out we need.
2561 */
2562 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002563 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002564 p = stackbuf;
2565 }
2566 else {
2567 /* Overallocate on the heap, and give the excess back at the end. */
2568 nallocated = size * 4;
2569 if (nallocated / 4 != size) /* overflow! */
2570 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002571 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002572 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002573 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002574 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002575 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002576
Tim Peters602f7402002-04-27 18:03:26 +00002577 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002578 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002579
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002580 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002581 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002583
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002585 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002586 *p++ = (char)(0xc0 | (ch >> 6));
2587 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002588 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002589#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002590 /* Special case: check for high and low surrogate */
2591 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2592 Py_UCS4 ch2 = s[i];
2593 /* Combine the two surrogates to form a UCS4 value */
2594 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2595 i++;
2596
2597 /* Encode UCS4 Unicode ordinals */
2598 *p++ = (char)(0xf0 | (ch >> 18));
2599 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002600 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2601 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002602 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002603#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002604 Py_ssize_t newpos;
2605 PyObject *rep;
2606 Py_ssize_t repsize, k;
2607 rep = unicode_encode_call_errorhandler
2608 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2609 s, size, &exc, i-1, i, &newpos);
2610 if (!rep)
2611 goto error;
2612
2613 if (PyBytes_Check(rep))
2614 repsize = PyBytes_GET_SIZE(rep);
2615 else
2616 repsize = PyUnicode_GET_SIZE(rep);
2617
2618 if (repsize > 4) {
2619 Py_ssize_t offset;
2620
2621 if (result == NULL)
2622 offset = p - stackbuf;
2623 else
2624 offset = p - PyBytes_AS_STRING(result);
2625
2626 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2627 /* integer overflow */
2628 PyErr_NoMemory();
2629 goto error;
2630 }
2631 nallocated += repsize - 4;
2632 if (result != NULL) {
2633 if (_PyBytes_Resize(&result, nallocated) < 0)
2634 goto error;
2635 } else {
2636 result = PyBytes_FromStringAndSize(NULL, nallocated);
2637 if (result == NULL)
2638 goto error;
2639 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2640 }
2641 p = PyBytes_AS_STRING(result) + offset;
2642 }
2643
2644 if (PyBytes_Check(rep)) {
2645 char *prep = PyBytes_AS_STRING(rep);
2646 for(k = repsize; k > 0; k--)
2647 *p++ = *prep++;
2648 } else /* rep is unicode */ {
2649 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2650 Py_UNICODE c;
2651
2652 for(k=0; k<repsize; k++) {
2653 c = prep[k];
2654 if (0x80 <= c) {
2655 raise_encode_exception(&exc, "utf-8", s, size,
2656 i-1, i, "surrogates not allowed");
2657 goto error;
2658 }
2659 *p++ = (char)prep[k];
2660 }
2661 }
2662 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002663#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002664 }
Victor Stinner445a6232010-04-22 20:01:57 +00002665#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002666 } else if (ch < 0x10000) {
2667 *p++ = (char)(0xe0 | (ch >> 12));
2668 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2669 *p++ = (char)(0x80 | (ch & 0x3f));
2670 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002671 /* Encode UCS4 Unicode ordinals */
2672 *p++ = (char)(0xf0 | (ch >> 18));
2673 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2674 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2675 *p++ = (char)(0x80 | (ch & 0x3f));
2676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002678
Guido van Rossum98297ee2007-11-06 21:34:58 +00002679 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002680 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002681 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002682 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002683 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002684 }
2685 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002686 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002687 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002688 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002689 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002690 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002691 Py_XDECREF(errorHandler);
2692 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002693 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002694 error:
2695 Py_XDECREF(errorHandler);
2696 Py_XDECREF(exc);
2697 Py_XDECREF(result);
2698 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002699
Tim Peters602f7402002-04-27 18:03:26 +00002700#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701}
2702
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2704{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 if (!PyUnicode_Check(unicode)) {
2706 PyErr_BadArgument();
2707 return NULL;
2708 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002709 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002710 PyUnicode_GET_SIZE(unicode),
2711 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712}
2713
Walter Dörwald41980ca2007-08-16 21:55:45 +00002714/* --- UTF-32 Codec ------------------------------------------------------- */
2715
2716PyObject *
2717PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 Py_ssize_t size,
2719 const char *errors,
2720 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002721{
2722 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2723}
2724
2725PyObject *
2726PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002727 Py_ssize_t size,
2728 const char *errors,
2729 int *byteorder,
2730 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002731{
2732 const char *starts = s;
2733 Py_ssize_t startinpos;
2734 Py_ssize_t endinpos;
2735 Py_ssize_t outpos;
2736 PyUnicodeObject *unicode;
2737 Py_UNICODE *p;
2738#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002739 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002740 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002741#else
2742 const int pairs = 0;
2743#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002744 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002745 int bo = 0; /* assume native ordering by default */
2746 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002747 /* Offsets from q for retrieving bytes in the right order. */
2748#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2749 int iorder[] = {0, 1, 2, 3};
2750#else
2751 int iorder[] = {3, 2, 1, 0};
2752#endif
2753 PyObject *errorHandler = NULL;
2754 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002755
Walter Dörwald41980ca2007-08-16 21:55:45 +00002756 q = (unsigned char *)s;
2757 e = q + size;
2758
2759 if (byteorder)
2760 bo = *byteorder;
2761
2762 /* Check for BOM marks (U+FEFF) in the input and adjust current
2763 byte order setting accordingly. In native mode, the leading BOM
2764 mark is skipped, in all other modes, it is copied to the output
2765 stream as-is (giving a ZWNBSP character). */
2766 if (bo == 0) {
2767 if (size >= 4) {
2768 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002769 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002770#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 if (bom == 0x0000FEFF) {
2772 q += 4;
2773 bo = -1;
2774 }
2775 else if (bom == 0xFFFE0000) {
2776 q += 4;
2777 bo = 1;
2778 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002779#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002780 if (bom == 0x0000FEFF) {
2781 q += 4;
2782 bo = 1;
2783 }
2784 else if (bom == 0xFFFE0000) {
2785 q += 4;
2786 bo = -1;
2787 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002788#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002789 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002790 }
2791
2792 if (bo == -1) {
2793 /* force LE */
2794 iorder[0] = 0;
2795 iorder[1] = 1;
2796 iorder[2] = 2;
2797 iorder[3] = 3;
2798 }
2799 else if (bo == 1) {
2800 /* force BE */
2801 iorder[0] = 3;
2802 iorder[1] = 2;
2803 iorder[2] = 1;
2804 iorder[3] = 0;
2805 }
2806
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002807 /* On narrow builds we split characters outside the BMP into two
2808 codepoints => count how much extra space we need. */
2809#ifndef Py_UNICODE_WIDE
2810 for (qq = q; qq < e; qq += 4)
2811 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2812 pairs++;
2813#endif
2814
2815 /* This might be one to much, because of a BOM */
2816 unicode = _PyUnicode_New((size+3)/4+pairs);
2817 if (!unicode)
2818 return NULL;
2819 if (size == 0)
2820 return (PyObject *)unicode;
2821
2822 /* Unpack UTF-32 encoded data */
2823 p = unicode->str;
2824
Walter Dörwald41980ca2007-08-16 21:55:45 +00002825 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 Py_UCS4 ch;
2827 /* remaining bytes at the end? (size should be divisible by 4) */
2828 if (e-q<4) {
2829 if (consumed)
2830 break;
2831 errmsg = "truncated data";
2832 startinpos = ((const char *)q)-starts;
2833 endinpos = ((const char *)e)-starts;
2834 goto utf32Error;
2835 /* The remaining input chars are ignored if the callback
2836 chooses to skip the input */
2837 }
2838 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2839 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002840
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 if (ch >= 0x110000)
2842 {
2843 errmsg = "codepoint not in range(0x110000)";
2844 startinpos = ((const char *)q)-starts;
2845 endinpos = startinpos+4;
2846 goto utf32Error;
2847 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002848#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002849 if (ch >= 0x10000)
2850 {
2851 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2852 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2853 }
2854 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002855#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002856 *p++ = ch;
2857 q += 4;
2858 continue;
2859 utf32Error:
2860 outpos = p-PyUnicode_AS_UNICODE(unicode);
2861 if (unicode_decode_call_errorhandler(
2862 errors, &errorHandler,
2863 "utf32", errmsg,
2864 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2865 &unicode, &outpos, &p))
2866 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002867 }
2868
2869 if (byteorder)
2870 *byteorder = bo;
2871
2872 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002873 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002874
2875 /* Adjust length */
2876 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2877 goto onError;
2878
2879 Py_XDECREF(errorHandler);
2880 Py_XDECREF(exc);
2881 return (PyObject *)unicode;
2882
Benjamin Peterson29060642009-01-31 22:14:21 +00002883 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002884 Py_DECREF(unicode);
2885 Py_XDECREF(errorHandler);
2886 Py_XDECREF(exc);
2887 return NULL;
2888}
2889
2890PyObject *
2891PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002892 Py_ssize_t size,
2893 const char *errors,
2894 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002895{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002896 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002897 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002898 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002899#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002900 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002901#else
2902 const int pairs = 0;
2903#endif
2904 /* Offsets from p for storing byte pairs in the right order. */
2905#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2906 int iorder[] = {0, 1, 2, 3};
2907#else
2908 int iorder[] = {3, 2, 1, 0};
2909#endif
2910
Benjamin Peterson29060642009-01-31 22:14:21 +00002911#define STORECHAR(CH) \
2912 do { \
2913 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2914 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2915 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2916 p[iorder[0]] = (CH) & 0xff; \
2917 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002918 } while(0)
2919
2920 /* In narrow builds we can output surrogate pairs as one codepoint,
2921 so we need less space. */
2922#ifndef Py_UNICODE_WIDE
2923 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002924 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2925 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2926 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002927#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002928 nsize = (size - pairs + (byteorder == 0));
2929 bytesize = nsize * 4;
2930 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002932 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002933 if (v == NULL)
2934 return NULL;
2935
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002936 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002937 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002938 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002939 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002940 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002941
2942 if (byteorder == -1) {
2943 /* force LE */
2944 iorder[0] = 0;
2945 iorder[1] = 1;
2946 iorder[2] = 2;
2947 iorder[3] = 3;
2948 }
2949 else if (byteorder == 1) {
2950 /* force BE */
2951 iorder[0] = 3;
2952 iorder[1] = 2;
2953 iorder[2] = 1;
2954 iorder[3] = 0;
2955 }
2956
2957 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002958 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002959#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002960 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2961 Py_UCS4 ch2 = *s;
2962 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2963 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2964 s++;
2965 size--;
2966 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002967 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002968#endif
2969 STORECHAR(ch);
2970 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002971
2972 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002973 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002974#undef STORECHAR
2975}
2976
2977PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2978{
2979 if (!PyUnicode_Check(unicode)) {
2980 PyErr_BadArgument();
2981 return NULL;
2982 }
2983 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 PyUnicode_GET_SIZE(unicode),
2985 NULL,
2986 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002987}
2988
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989/* --- UTF-16 Codec ------------------------------------------------------- */
2990
Tim Peters772747b2001-08-09 22:21:55 +00002991PyObject *
2992PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002993 Py_ssize_t size,
2994 const char *errors,
2995 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996{
Walter Dörwald69652032004-09-07 20:24:22 +00002997 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2998}
2999
Antoine Pitrouab868312009-01-10 15:40:25 +00003000/* Two masks for fast checking of whether a C 'long' may contain
3001 UTF16-encoded surrogate characters. This is an efficient heuristic,
3002 assuming that non-surrogate characters with a code point >= 0x8000 are
3003 rare in most input.
3004 FAST_CHAR_MASK is used when the input is in native byte ordering,
3005 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003006*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003007#if (SIZEOF_LONG == 8)
3008# define FAST_CHAR_MASK 0x8000800080008000L
3009# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3010#elif (SIZEOF_LONG == 4)
3011# define FAST_CHAR_MASK 0x80008000L
3012# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3013#else
3014# error C 'long' size should be either 4 or 8!
3015#endif
3016
Walter Dörwald69652032004-09-07 20:24:22 +00003017PyObject *
3018PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 Py_ssize_t size,
3020 const char *errors,
3021 int *byteorder,
3022 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003023{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003024 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003025 Py_ssize_t startinpos;
3026 Py_ssize_t endinpos;
3027 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 PyUnicodeObject *unicode;
3029 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003030 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003031 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003032 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003033 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003034 /* Offsets from q for retrieving byte pairs in the right order. */
3035#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3036 int ihi = 1, ilo = 0;
3037#else
3038 int ihi = 0, ilo = 1;
3039#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 PyObject *errorHandler = NULL;
3041 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042
3043 /* Note: size will always be longer than the resulting Unicode
3044 character count */
3045 unicode = _PyUnicode_New(size);
3046 if (!unicode)
3047 return NULL;
3048 if (size == 0)
3049 return (PyObject *)unicode;
3050
3051 /* Unpack UTF-16 encoded data */
3052 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003053 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003054 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055
3056 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003057 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003059 /* Check for BOM marks (U+FEFF) in the input and adjust current
3060 byte order setting accordingly. In native mode, the leading BOM
3061 mark is skipped, in all other modes, it is copied to the output
3062 stream as-is (giving a ZWNBSP character). */
3063 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003064 if (size >= 2) {
3065 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003066#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003067 if (bom == 0xFEFF) {
3068 q += 2;
3069 bo = -1;
3070 }
3071 else if (bom == 0xFFFE) {
3072 q += 2;
3073 bo = 1;
3074 }
Tim Petersced69f82003-09-16 20:30:58 +00003075#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 if (bom == 0xFEFF) {
3077 q += 2;
3078 bo = 1;
3079 }
3080 else if (bom == 0xFFFE) {
3081 q += 2;
3082 bo = -1;
3083 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003084#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003085 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087
Tim Peters772747b2001-08-09 22:21:55 +00003088 if (bo == -1) {
3089 /* force LE */
3090 ihi = 1;
3091 ilo = 0;
3092 }
3093 else if (bo == 1) {
3094 /* force BE */
3095 ihi = 0;
3096 ilo = 1;
3097 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003098#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3099 native_ordering = ilo < ihi;
3100#else
3101 native_ordering = ilo > ihi;
3102#endif
Tim Peters772747b2001-08-09 22:21:55 +00003103
Antoine Pitrouab868312009-01-10 15:40:25 +00003104 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003105 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003106 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003107 /* First check for possible aligned read of a C 'long'. Unaligned
3108 reads are more expensive, better to defer to another iteration. */
3109 if (!((size_t) q & LONG_PTR_MASK)) {
3110 /* Fast path for runs of non-surrogate chars. */
3111 register const unsigned char *_q = q;
3112 Py_UNICODE *_p = p;
3113 if (native_ordering) {
3114 /* Native ordering is simple: as long as the input cannot
3115 possibly contain a surrogate char, do an unrolled copy
3116 of several 16-bit code points to the target object.
3117 The non-surrogate check is done on several input bytes
3118 at a time (as many as a C 'long' can contain). */
3119 while (_q < aligned_end) {
3120 unsigned long data = * (unsigned long *) _q;
3121 if (data & FAST_CHAR_MASK)
3122 break;
3123 _p[0] = ((unsigned short *) _q)[0];
3124 _p[1] = ((unsigned short *) _q)[1];
3125#if (SIZEOF_LONG == 8)
3126 _p[2] = ((unsigned short *) _q)[2];
3127 _p[3] = ((unsigned short *) _q)[3];
3128#endif
3129 _q += SIZEOF_LONG;
3130 _p += SIZEOF_LONG / 2;
3131 }
3132 }
3133 else {
3134 /* Byteswapped ordering is similar, but we must decompose
3135 the copy bytewise, and take care of zero'ing out the
3136 upper bytes if the target object is in 32-bit units
3137 (that is, in UCS-4 builds). */
3138 while (_q < aligned_end) {
3139 unsigned long data = * (unsigned long *) _q;
3140 if (data & SWAPPED_FAST_CHAR_MASK)
3141 break;
3142 /* Zero upper bytes in UCS-4 builds */
3143#if (Py_UNICODE_SIZE > 2)
3144 _p[0] = 0;
3145 _p[1] = 0;
3146#if (SIZEOF_LONG == 8)
3147 _p[2] = 0;
3148 _p[3] = 0;
3149#endif
3150#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003151 /* Issue #4916; UCS-4 builds on big endian machines must
3152 fill the two last bytes of each 4-byte unit. */
3153#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3154# define OFF 2
3155#else
3156# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003157#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003158 ((unsigned char *) _p)[OFF + 1] = _q[0];
3159 ((unsigned char *) _p)[OFF + 0] = _q[1];
3160 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3161 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3162#if (SIZEOF_LONG == 8)
3163 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3164 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3165 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3166 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3167#endif
3168#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003169 _q += SIZEOF_LONG;
3170 _p += SIZEOF_LONG / 2;
3171 }
3172 }
3173 p = _p;
3174 q = _q;
3175 if (q >= e)
3176 break;
3177 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179
Benjamin Peterson14339b62009-01-31 16:36:08 +00003180 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003181
3182 if (ch < 0xD800 || ch > 0xDFFF) {
3183 *p++ = ch;
3184 continue;
3185 }
3186
3187 /* UTF-16 code pair: */
3188 if (q > e) {
3189 errmsg = "unexpected end of data";
3190 startinpos = (((const char *)q) - 2) - starts;
3191 endinpos = ((const char *)e) + 1 - starts;
3192 goto utf16Error;
3193 }
3194 if (0xD800 <= ch && ch <= 0xDBFF) {
3195 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3196 q += 2;
3197 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003198#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003199 *p++ = ch;
3200 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003201#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003202 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003203#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003204 continue;
3205 }
3206 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003207 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003208 startinpos = (((const char *)q)-4)-starts;
3209 endinpos = startinpos+2;
3210 goto utf16Error;
3211 }
3212
Benjamin Peterson14339b62009-01-31 16:36:08 +00003213 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003214 errmsg = "illegal encoding";
3215 startinpos = (((const char *)q)-2)-starts;
3216 endinpos = startinpos+2;
3217 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003218
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 utf16Error:
3220 outpos = p - PyUnicode_AS_UNICODE(unicode);
3221 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003222 errors,
3223 &errorHandler,
3224 "utf16", errmsg,
3225 &starts,
3226 (const char **)&e,
3227 &startinpos,
3228 &endinpos,
3229 &exc,
3230 (const char **)&q,
3231 &unicode,
3232 &outpos,
3233 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003236 /* remaining byte at the end? (size should be even) */
3237 if (e == q) {
3238 if (!consumed) {
3239 errmsg = "truncated data";
3240 startinpos = ((const char *)q) - starts;
3241 endinpos = ((const char *)e) + 1 - starts;
3242 outpos = p - PyUnicode_AS_UNICODE(unicode);
3243 if (unicode_decode_call_errorhandler(
3244 errors,
3245 &errorHandler,
3246 "utf16", errmsg,
3247 &starts,
3248 (const char **)&e,
3249 &startinpos,
3250 &endinpos,
3251 &exc,
3252 (const char **)&q,
3253 &unicode,
3254 &outpos,
3255 &p))
3256 goto onError;
3257 /* The remaining input chars are ignored if the callback
3258 chooses to skip the input */
3259 }
3260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261
3262 if (byteorder)
3263 *byteorder = bo;
3264
Walter Dörwald69652032004-09-07 20:24:22 +00003265 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003266 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003267
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003269 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 goto onError;
3271
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 Py_XDECREF(errorHandler);
3273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 return (PyObject *)unicode;
3275
Benjamin Peterson29060642009-01-31 22:14:21 +00003276 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 Py_XDECREF(errorHandler);
3279 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 return NULL;
3281}
3282
Antoine Pitrouab868312009-01-10 15:40:25 +00003283#undef FAST_CHAR_MASK
3284#undef SWAPPED_FAST_CHAR_MASK
3285
Tim Peters772747b2001-08-09 22:21:55 +00003286PyObject *
3287PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003288 Py_ssize_t size,
3289 const char *errors,
3290 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003292 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003293 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003294 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003295#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003296 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003297#else
3298 const int pairs = 0;
3299#endif
Tim Peters772747b2001-08-09 22:21:55 +00003300 /* Offsets from p for storing byte pairs in the right order. */
3301#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3302 int ihi = 1, ilo = 0;
3303#else
3304 int ihi = 0, ilo = 1;
3305#endif
3306
Benjamin Peterson29060642009-01-31 22:14:21 +00003307#define STORECHAR(CH) \
3308 do { \
3309 p[ihi] = ((CH) >> 8) & 0xff; \
3310 p[ilo] = (CH) & 0xff; \
3311 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003312 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003314#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003315 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 if (s[i] >= 0x10000)
3317 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003318#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003319 /* 2 * (size + pairs + (byteorder == 0)) */
3320 if (size > PY_SSIZE_T_MAX ||
3321 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003323 nsize = size + pairs + (byteorder == 0);
3324 bytesize = nsize * 2;
3325 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003327 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328 if (v == NULL)
3329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003331 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003334 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003335 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003336
3337 if (byteorder == -1) {
3338 /* force LE */
3339 ihi = 1;
3340 ilo = 0;
3341 }
3342 else if (byteorder == 1) {
3343 /* force BE */
3344 ihi = 0;
3345 ilo = 1;
3346 }
3347
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003348 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003349 Py_UNICODE ch = *s++;
3350 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003351#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003352 if (ch >= 0x10000) {
3353 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3354 ch = 0xD800 | ((ch-0x10000) >> 10);
3355 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003356#endif
Tim Peters772747b2001-08-09 22:21:55 +00003357 STORECHAR(ch);
3358 if (ch2)
3359 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003360 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003361
3362 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003363 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003364#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365}
3366
3367PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3368{
3369 if (!PyUnicode_Check(unicode)) {
3370 PyErr_BadArgument();
3371 return NULL;
3372 }
3373 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003374 PyUnicode_GET_SIZE(unicode),
3375 NULL,
3376 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377}
3378
3379/* --- Unicode Escape Codec ----------------------------------------------- */
3380
Fredrik Lundh06d12682001-01-24 07:59:11 +00003381static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003382
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003384 Py_ssize_t size,
3385 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003388 Py_ssize_t startinpos;
3389 Py_ssize_t endinpos;
3390 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003391 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003395 char* message;
3396 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003397 PyObject *errorHandler = NULL;
3398 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003399
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400 /* Escaped strings will always be longer than the resulting
3401 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402 length after conversion to the true value.
3403 (but if the error callback returns a long replacement string
3404 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405 v = _PyUnicode_New(size);
3406 if (v == NULL)
3407 goto onError;
3408 if (size == 0)
3409 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003413
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414 while (s < end) {
3415 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003416 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418
3419 /* Non-escape characters are interpreted as Unicode ordinals */
3420 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003421 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 continue;
3423 }
3424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003425 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 /* \ - Escapes */
3427 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003428 c = *s++;
3429 if (s > end)
3430 c = '\0'; /* Invalid after \ */
3431 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434 case '\n': break;
3435 case '\\': *p++ = '\\'; break;
3436 case '\'': *p++ = '\''; break;
3437 case '\"': *p++ = '\"'; break;
3438 case 'b': *p++ = '\b'; break;
3439 case 'f': *p++ = '\014'; break; /* FF */
3440 case 't': *p++ = '\t'; break;
3441 case 'n': *p++ = '\n'; break;
3442 case 'r': *p++ = '\r'; break;
3443 case 'v': *p++ = '\013'; break; /* VT */
3444 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3445
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 case '0': case '1': case '2': case '3':
3448 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003449 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003450 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003451 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003452 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003453 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003455 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456 break;
3457
Benjamin Peterson29060642009-01-31 22:14:21 +00003458 /* hex escapes */
3459 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003461 digits = 2;
3462 message = "truncated \\xXX escape";
3463 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464
Benjamin Peterson29060642009-01-31 22:14:21 +00003465 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003467 digits = 4;
3468 message = "truncated \\uXXXX escape";
3469 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470
Benjamin Peterson29060642009-01-31 22:14:21 +00003471 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003472 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003473 digits = 8;
3474 message = "truncated \\UXXXXXXXX escape";
3475 hexescape:
3476 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 outpos = p-PyUnicode_AS_UNICODE(v);
3478 if (s+digits>end) {
3479 endinpos = size;
3480 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 errors, &errorHandler,
3482 "unicodeescape", "end of string in escape sequence",
3483 &starts, &end, &startinpos, &endinpos, &exc, &s,
3484 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485 goto onError;
3486 goto nextByte;
3487 }
3488 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003489 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003490 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 endinpos = (s+i+1)-starts;
3492 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003493 errors, &errorHandler,
3494 "unicodeescape", message,
3495 &starts, &end, &startinpos, &endinpos, &exc, &s,
3496 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003497 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003499 }
3500 chr = (chr<<4) & ~0xF;
3501 if (c >= '0' && c <= '9')
3502 chr += c - '0';
3503 else if (c >= 'a' && c <= 'f')
3504 chr += 10 + c - 'a';
3505 else
3506 chr += 10 + c - 'A';
3507 }
3508 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003509 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510 /* _decoding_error will have already written into the
3511 target buffer. */
3512 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003513 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003514 /* when we get here, chr is a 32-bit unicode character */
3515 if (chr <= 0xffff)
3516 /* UCS-2 character */
3517 *p++ = (Py_UNICODE) chr;
3518 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003519 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003520 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003521#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003522 *p++ = chr;
3523#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003524 chr -= 0x10000L;
3525 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003526 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003527#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003528 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 endinpos = s-starts;
3530 outpos = p-PyUnicode_AS_UNICODE(v);
3531 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003532 errors, &errorHandler,
3533 "unicodeescape", "illegal Unicode character",
3534 &starts, &end, &startinpos, &endinpos, &exc, &s,
3535 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003536 goto onError;
3537 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003538 break;
3539
Benjamin Peterson29060642009-01-31 22:14:21 +00003540 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003541 case 'N':
3542 message = "malformed \\N character escape";
3543 if (ucnhash_CAPI == NULL) {
3544 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003545 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003546 if (ucnhash_CAPI == NULL)
3547 goto ucnhashError;
3548 }
3549 if (*s == '{') {
3550 const char *start = s+1;
3551 /* look for the closing brace */
3552 while (*s != '}' && s < end)
3553 s++;
3554 if (s > start && s < end && *s == '}') {
3555 /* found a name. look it up in the unicode database */
3556 message = "unknown Unicode character name";
3557 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003558 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003559 goto store;
3560 }
3561 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 endinpos = s-starts;
3563 outpos = p-PyUnicode_AS_UNICODE(v);
3564 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003565 errors, &errorHandler,
3566 "unicodeescape", message,
3567 &starts, &end, &startinpos, &endinpos, &exc, &s,
3568 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003569 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003570 break;
3571
3572 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003573 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 message = "\\ at end of string";
3575 s--;
3576 endinpos = s-starts;
3577 outpos = p-PyUnicode_AS_UNICODE(v);
3578 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003579 errors, &errorHandler,
3580 "unicodeescape", message,
3581 &starts, &end, &startinpos, &endinpos, &exc, &s,
3582 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003583 goto onError;
3584 }
3585 else {
3586 *p++ = '\\';
3587 *p++ = (unsigned char)s[-1];
3588 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003589 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003594 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003596 Py_XDECREF(errorHandler);
3597 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003599
Benjamin Peterson29060642009-01-31 22:14:21 +00003600 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003601 PyErr_SetString(
3602 PyExc_UnicodeError,
3603 "\\N escapes not supported (can't load unicodedata module)"
3604 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003605 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 Py_XDECREF(errorHandler);
3607 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003608 return NULL;
3609
Benjamin Peterson29060642009-01-31 22:14:21 +00003610 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 Py_XDECREF(errorHandler);
3613 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 return NULL;
3615}
3616
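/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   NOTE(review): this comment describes the escape encoder
   (PyUnicode_EncodeUnicodeEscape, defined after the findchar() helper
   that immediately follows), not findchar() itself.  That encoder's
   signature has no "quotes" parameter, so the sentence about quotes
   looks like a leftover from an earlier interface -- confirm and trim.

*/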
3623
Thomas Wouters477c8d52006-05-27 19:21:47 +00003624Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003625 Py_ssize_t size,
3626 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003627{
3628 /* like wcschr, but doesn't stop at NULL characters */
3629
3630 while (size-- > 0) {
3631 if (*s == ch)
3632 return s;
3633 s++;
3634 }
3635
3636 return NULL;
3637}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003638
Walter Dörwald79e913e2007-05-12 11:08:06 +00003639static const char *hexdigits = "0123456789abcdef";
3640
3641PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003642 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003644 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003647#ifdef Py_UNICODE_WIDE
3648 const Py_ssize_t expandsize = 10;
3649#else
3650 const Py_ssize_t expandsize = 6;
3651#endif
3652
Thomas Wouters89f507f2006-12-13 04:49:30 +00003653 /* XXX(nnorwitz): rather than over-allocating, it would be
3654 better to choose a different scheme. Perhaps scan the
3655 first N-chars of the string and allocate based on that size.
3656 */
3657 /* Initial allocation is based on the longest-possible unichr
3658 escape.
3659
3660 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3661 unichr, so in this case it's the longest unichr escape. In
3662 narrow (UTF-16) builds this is five chars per source unichr
3663 since there are two unichrs in the surrogate pair, so in narrow
3664 (UTF-16) builds it's not the longest unichr escape.
3665
3666 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3667 so in the narrow (UTF-16) build case it's the longest unichr
3668 escape.
3669 */
3670
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003671 if (size == 0)
3672 return PyBytes_FromStringAndSize(NULL, 0);
3673
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003674 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003675 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003676
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003677 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 2
3679 + expandsize*size
3680 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 if (repr == NULL)
3682 return NULL;
3683
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003684 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 while (size-- > 0) {
3687 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003688
Walter Dörwald79e913e2007-05-12 11:08:06 +00003689 /* Escape backslashes */
3690 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 *p++ = '\\';
3692 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003693 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003694 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003695
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003696#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003697 /* Map 21-bit characters to '\U00xxxxxx' */
3698 else if (ch >= 0x10000) {
3699 *p++ = '\\';
3700 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003701 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3702 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3703 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3704 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3705 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3706 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3707 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3708 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003710 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003711#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3713 else if (ch >= 0xD800 && ch < 0xDC00) {
3714 Py_UNICODE ch2;
3715 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003716
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 ch2 = *s++;
3718 size--;
3719 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3720 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3721 *p++ = '\\';
3722 *p++ = 'U';
3723 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3724 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3725 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3726 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3727 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3728 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3729 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3730 *p++ = hexdigits[ucs & 0x0000000F];
3731 continue;
3732 }
3733 /* Fall through: isolated surrogates are copied as-is */
3734 s--;
3735 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003736 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003737#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003738
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003740 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 *p++ = '\\';
3742 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003743 *p++ = hexdigits[(ch >> 12) & 0x000F];
3744 *p++ = hexdigits[(ch >> 8) & 0x000F];
3745 *p++ = hexdigits[(ch >> 4) & 0x000F];
3746 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003748
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003749 /* Map special whitespace to '\t', \n', '\r' */
3750 else if (ch == '\t') {
3751 *p++ = '\\';
3752 *p++ = 't';
3753 }
3754 else if (ch == '\n') {
3755 *p++ = '\\';
3756 *p++ = 'n';
3757 }
3758 else if (ch == '\r') {
3759 *p++ = '\\';
3760 *p++ = 'r';
3761 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003762
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003763 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003764 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003766 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003767 *p++ = hexdigits[(ch >> 4) & 0x000F];
3768 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003769 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003770
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 /* Copy everything else as-is */
3772 else
3773 *p++ = (char) ch;
3774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003776 assert(p - PyBytes_AS_STRING(repr) > 0);
3777 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3778 return NULL;
3779 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780}
3781
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003782PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003784 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 if (!PyUnicode_Check(unicode)) {
3786 PyErr_BadArgument();
3787 return NULL;
3788 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003789 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3790 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003791 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792}
3793
3794/* --- Raw Unicode Escape Codec ------------------------------------------- */
3795
3796PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003797 Py_ssize_t size,
3798 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003800 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003801 Py_ssize_t startinpos;
3802 Py_ssize_t endinpos;
3803 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 const char *end;
3807 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 PyObject *errorHandler = NULL;
3809 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003810
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 /* Escaped strings will always be longer than the resulting
3812 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 length after conversion to the true value. (But decoding error
3814 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 v = _PyUnicode_New(size);
3816 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003819 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 end = s + size;
3822 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 unsigned char c;
3824 Py_UCS4 x;
3825 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003826 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827
Benjamin Peterson29060642009-01-31 22:14:21 +00003828 /* Non-escape characters are interpreted as Unicode ordinals */
3829 if (*s != '\\') {
3830 *p++ = (unsigned char)*s++;
3831 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003832 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003833 startinpos = s-starts;
3834
3835 /* \u-escapes are only interpreted iff the number of leading
3836 backslashes if odd */
3837 bs = s;
3838 for (;s < end;) {
3839 if (*s != '\\')
3840 break;
3841 *p++ = (unsigned char)*s++;
3842 }
3843 if (((s - bs) & 1) == 0 ||
3844 s >= end ||
3845 (*s != 'u' && *s != 'U')) {
3846 continue;
3847 }
3848 p--;
3849 count = *s=='u' ? 4 : 8;
3850 s++;
3851
3852 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3853 outpos = p-PyUnicode_AS_UNICODE(v);
3854 for (x = 0, i = 0; i < count; ++i, ++s) {
3855 c = (unsigned char)*s;
3856 if (!ISXDIGIT(c)) {
3857 endinpos = s-starts;
3858 if (unicode_decode_call_errorhandler(
3859 errors, &errorHandler,
3860 "rawunicodeescape", "truncated \\uXXXX",
3861 &starts, &end, &startinpos, &endinpos, &exc, &s,
3862 &v, &outpos, &p))
3863 goto onError;
3864 goto nextByte;
3865 }
3866 x = (x<<4) & ~0xF;
3867 if (c >= '0' && c <= '9')
3868 x += c - '0';
3869 else if (c >= 'a' && c <= 'f')
3870 x += 10 + c - 'a';
3871 else
3872 x += 10 + c - 'A';
3873 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003874 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003875 /* UCS-2 character */
3876 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003877 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003878 /* UCS-4 character. Either store directly, or as
3879 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003880#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003881 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003882#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003883 x -= 0x10000L;
3884 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3885 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003886#endif
3887 } else {
3888 endinpos = s-starts;
3889 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003890 if (unicode_decode_call_errorhandler(
3891 errors, &errorHandler,
3892 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003893 &starts, &end, &startinpos, &endinpos, &exc, &s,
3894 &v, &outpos, &p))
3895 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003896 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003897 nextByte:
3898 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003899 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003900 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003902 Py_XDECREF(errorHandler);
3903 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003905
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 Py_XDECREF(errorHandler);
3909 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910 return NULL;
3911}
3912
3913PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003914 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003916 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 char *p;
3918 char *q;
3919
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003920#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003921 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003922#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003923 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003924#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003925
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003926 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003927 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003928
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003929 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 if (repr == NULL)
3931 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003932 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003933 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003935 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 while (size-- > 0) {
3937 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003938#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003939 /* Map 32-bit characters to '\Uxxxxxxxx' */
3940 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003941 *p++ = '\\';
3942 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003943 *p++ = hexdigits[(ch >> 28) & 0xf];
3944 *p++ = hexdigits[(ch >> 24) & 0xf];
3945 *p++ = hexdigits[(ch >> 20) & 0xf];
3946 *p++ = hexdigits[(ch >> 16) & 0xf];
3947 *p++ = hexdigits[(ch >> 12) & 0xf];
3948 *p++ = hexdigits[(ch >> 8) & 0xf];
3949 *p++ = hexdigits[(ch >> 4) & 0xf];
3950 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003951 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003952 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003953#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003954 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3955 if (ch >= 0xD800 && ch < 0xDC00) {
3956 Py_UNICODE ch2;
3957 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003958
Benjamin Peterson29060642009-01-31 22:14:21 +00003959 ch2 = *s++;
3960 size--;
3961 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3962 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3963 *p++ = '\\';
3964 *p++ = 'U';
3965 *p++ = hexdigits[(ucs >> 28) & 0xf];
3966 *p++ = hexdigits[(ucs >> 24) & 0xf];
3967 *p++ = hexdigits[(ucs >> 20) & 0xf];
3968 *p++ = hexdigits[(ucs >> 16) & 0xf];
3969 *p++ = hexdigits[(ucs >> 12) & 0xf];
3970 *p++ = hexdigits[(ucs >> 8) & 0xf];
3971 *p++ = hexdigits[(ucs >> 4) & 0xf];
3972 *p++ = hexdigits[ucs & 0xf];
3973 continue;
3974 }
3975 /* Fall through: isolated surrogates are copied as-is */
3976 s--;
3977 size++;
3978 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003979#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 /* Map 16-bit characters to '\uxxxx' */
3981 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 *p++ = '\\';
3983 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003984 *p++ = hexdigits[(ch >> 12) & 0xf];
3985 *p++ = hexdigits[(ch >> 8) & 0xf];
3986 *p++ = hexdigits[(ch >> 4) & 0xf];
3987 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003989 /* Copy everything else as-is */
3990 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 *p++ = (char) ch;
3992 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003993 size = p - q;
3994
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003995 assert(size > 0);
3996 if (_PyBytes_Resize(&repr, size) < 0)
3997 return NULL;
3998 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999}
4000
4001PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4002{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004003 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004005 PyErr_BadArgument();
4006 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004008 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4009 PyUnicode_GET_SIZE(unicode));
4010
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004011 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012}
4013
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004014/* --- Unicode Internal Codec ------------------------------------------- */
4015
4016PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004017 Py_ssize_t size,
4018 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004019{
4020 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004021 Py_ssize_t startinpos;
4022 Py_ssize_t endinpos;
4023 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004024 PyUnicodeObject *v;
4025 Py_UNICODE *p;
4026 const char *end;
4027 const char *reason;
4028 PyObject *errorHandler = NULL;
4029 PyObject *exc = NULL;
4030
Neal Norwitzd43069c2006-01-08 01:12:10 +00004031#ifdef Py_UNICODE_WIDE
4032 Py_UNICODE unimax = PyUnicode_GetMax();
4033#endif
4034
Thomas Wouters89f507f2006-12-13 04:49:30 +00004035 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004036 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4037 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004039 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004040 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004041 p = PyUnicode_AS_UNICODE(v);
4042 end = s + size;
4043
4044 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004045 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004046 /* We have to sanity check the raw data, otherwise doom looms for
4047 some malformed UCS-4 data. */
4048 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004049#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004050 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004051#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004052 end-s < Py_UNICODE_SIZE
4053 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004055 startinpos = s - starts;
4056 if (end-s < Py_UNICODE_SIZE) {
4057 endinpos = end-starts;
4058 reason = "truncated input";
4059 }
4060 else {
4061 endinpos = s - starts + Py_UNICODE_SIZE;
4062 reason = "illegal code point (> 0x10FFFF)";
4063 }
4064 outpos = p - PyUnicode_AS_UNICODE(v);
4065 if (unicode_decode_call_errorhandler(
4066 errors, &errorHandler,
4067 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004068 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004069 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004070 goto onError;
4071 }
4072 }
4073 else {
4074 p++;
4075 s += Py_UNICODE_SIZE;
4076 }
4077 }
4078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004079 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004080 goto onError;
4081 Py_XDECREF(errorHandler);
4082 Py_XDECREF(exc);
4083 return (PyObject *)v;
4084
Benjamin Peterson29060642009-01-31 22:14:21 +00004085 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004086 Py_XDECREF(v);
4087 Py_XDECREF(errorHandler);
4088 Py_XDECREF(exc);
4089 return NULL;
4090}
4091
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092/* --- Latin-1 Codec ------------------------------------------------------ */
4093
4094PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004095 Py_ssize_t size,
4096 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097{
4098 PyUnicodeObject *v;
4099 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004100 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004101
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004103 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004104 Py_UNICODE r = *(unsigned char*)s;
4105 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004106 }
4107
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 v = _PyUnicode_New(size);
4109 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004110 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004112 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004114 e = s + size;
4115 /* Unrolling the copy makes it much faster by reducing the looping
4116 overhead. This is similar to what many memcpy() implementations do. */
4117 unrolled_end = e - 4;
4118 while (s < unrolled_end) {
4119 p[0] = (unsigned char) s[0];
4120 p[1] = (unsigned char) s[1];
4121 p[2] = (unsigned char) s[2];
4122 p[3] = (unsigned char) s[3];
4123 s += 4;
4124 p += 4;
4125 }
4126 while (s < e)
4127 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004129
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 Py_XDECREF(v);
4132 return NULL;
4133}
4134
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135/* create or adjust a UnicodeEncodeError */
4136static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004137 const char *encoding,
4138 const Py_UNICODE *unicode, Py_ssize_t size,
4139 Py_ssize_t startpos, Py_ssize_t endpos,
4140 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004143 *exceptionObject = PyUnicodeEncodeError_Create(
4144 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 }
4146 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4148 goto onError;
4149 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4150 goto onError;
4151 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4152 goto onError;
4153 return;
4154 onError:
4155 Py_DECREF(*exceptionObject);
4156 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157 }
4158}
4159
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160/* raises a UnicodeEncodeError */
4161static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 const char *encoding,
4163 const Py_UNICODE *unicode, Py_ssize_t size,
4164 Py_ssize_t startpos, Py_ssize_t endpos,
4165 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166{
4167 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004168 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004169 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004170 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171}
4172
4173/* error handling callback helper:
4174 build arguments, call the callback and check the arguments,
4175 put the result into newpos and return the replacement string, which
4176 has to be freed by the caller */
4177static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 PyObject **errorHandler,
4179 const char *encoding, const char *reason,
4180 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4181 Py_ssize_t startpos, Py_ssize_t endpos,
4182 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004184 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185
4186 PyObject *restuple;
4187 PyObject *resunicode;
4188
4189 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 }
4194
4195 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199
4200 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004205 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 Py_DECREF(restuple);
4207 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004209 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 &resunicode, newpos)) {
4211 Py_DECREF(restuple);
4212 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004214 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4215 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4216 Py_DECREF(restuple);
4217 return NULL;
4218 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004221 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004222 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4223 Py_DECREF(restuple);
4224 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004225 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226 Py_INCREF(resunicode);
4227 Py_DECREF(restuple);
4228 return resunicode;
4229}
4230
4231static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004232 Py_ssize_t size,
4233 const char *errors,
4234 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235{
4236 /* output object */
4237 PyObject *res;
4238 /* pointers to the beginning and end+1 of input */
4239 const Py_UNICODE *startp = p;
4240 const Py_UNICODE *endp = p + size;
4241 /* pointer to the beginning of the unencodable characters */
4242 /* const Py_UNICODE *badp = NULL; */
4243 /* pointer into the output */
4244 char *str;
4245 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004246 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004247 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4248 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004249 PyObject *errorHandler = NULL;
4250 PyObject *exc = NULL;
4251 /* the following variable is used for caching string comparisons
4252 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4253 int known_errorHandler = -1;
4254
4255 /* allocate enough for a simple encoding without
4256 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004257 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004258 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004259 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004261 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004262 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 ressize = size;
4264
4265 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 /* can we encode this? */
4269 if (c<limit) {
4270 /* no overflow check, because we know that the space is enough */
4271 *str++ = (char)c;
4272 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004273 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 else {
4275 Py_ssize_t unicodepos = p-startp;
4276 Py_ssize_t requiredsize;
4277 PyObject *repunicode;
4278 Py_ssize_t repsize;
4279 Py_ssize_t newpos;
4280 Py_ssize_t respos;
4281 Py_UNICODE *uni2;
4282 /* startpos for collecting unencodable chars */
4283 const Py_UNICODE *collstart = p;
4284 const Py_UNICODE *collend = p;
4285 /* find all unecodable characters */
4286 while ((collend < endp) && ((*collend)>=limit))
4287 ++collend;
4288 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4289 if (known_errorHandler==-1) {
4290 if ((errors==NULL) || (!strcmp(errors, "strict")))
4291 known_errorHandler = 1;
4292 else if (!strcmp(errors, "replace"))
4293 known_errorHandler = 2;
4294 else if (!strcmp(errors, "ignore"))
4295 known_errorHandler = 3;
4296 else if (!strcmp(errors, "xmlcharrefreplace"))
4297 known_errorHandler = 4;
4298 else
4299 known_errorHandler = 0;
4300 }
4301 switch (known_errorHandler) {
4302 case 1: /* strict */
4303 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4304 goto onError;
4305 case 2: /* replace */
4306 while (collstart++<collend)
4307 *str++ = '?'; /* fall through */
4308 case 3: /* ignore */
4309 p = collend;
4310 break;
4311 case 4: /* xmlcharrefreplace */
4312 respos = str - PyBytes_AS_STRING(res);
4313 /* determine replacement size (temporarily (mis)uses p) */
4314 for (p = collstart, repsize = 0; p < collend; ++p) {
4315 if (*p<10)
4316 repsize += 2+1+1;
4317 else if (*p<100)
4318 repsize += 2+2+1;
4319 else if (*p<1000)
4320 repsize += 2+3+1;
4321 else if (*p<10000)
4322 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004323#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004324 else
4325 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004326#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004327 else if (*p<100000)
4328 repsize += 2+5+1;
4329 else if (*p<1000000)
4330 repsize += 2+6+1;
4331 else
4332 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004333#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 }
4335 requiredsize = respos+repsize+(endp-collend);
4336 if (requiredsize > ressize) {
4337 if (requiredsize<2*ressize)
4338 requiredsize = 2*ressize;
4339 if (_PyBytes_Resize(&res, requiredsize))
4340 goto onError;
4341 str = PyBytes_AS_STRING(res) + respos;
4342 ressize = requiredsize;
4343 }
4344 /* generate replacement (temporarily (mis)uses p) */
4345 for (p = collstart; p < collend; ++p) {
4346 str += sprintf(str, "&#%d;", (int)*p);
4347 }
4348 p = collend;
4349 break;
4350 default:
4351 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4352 encoding, reason, startp, size, &exc,
4353 collstart-startp, collend-startp, &newpos);
4354 if (repunicode == NULL)
4355 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004356 if (PyBytes_Check(repunicode)) {
4357 /* Directly copy bytes result to output. */
4358 repsize = PyBytes_Size(repunicode);
4359 if (repsize > 1) {
4360 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004361 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004362 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4363 Py_DECREF(repunicode);
4364 goto onError;
4365 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004366 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004367 ressize += repsize-1;
4368 }
4369 memcpy(str, PyBytes_AsString(repunicode), repsize);
4370 str += repsize;
4371 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004372 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004373 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004374 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 /* need more space? (at least enough for what we
4376 have+the replacement+the rest of the string, so
4377 we won't have to check space for encodable characters) */
4378 respos = str - PyBytes_AS_STRING(res);
4379 repsize = PyUnicode_GET_SIZE(repunicode);
4380 requiredsize = respos+repsize+(endp-collend);
4381 if (requiredsize > ressize) {
4382 if (requiredsize<2*ressize)
4383 requiredsize = 2*ressize;
4384 if (_PyBytes_Resize(&res, requiredsize)) {
4385 Py_DECREF(repunicode);
4386 goto onError;
4387 }
4388 str = PyBytes_AS_STRING(res) + respos;
4389 ressize = requiredsize;
4390 }
4391 /* check if there is anything unencodable in the replacement
4392 and copy it to the output */
4393 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4394 c = *uni2;
4395 if (c >= limit) {
4396 raise_encode_exception(&exc, encoding, startp, size,
4397 unicodepos, unicodepos+1, reason);
4398 Py_DECREF(repunicode);
4399 goto onError;
4400 }
4401 *str = (char)c;
4402 }
4403 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004404 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004405 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004406 }
4407 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004408 /* Resize if we allocated to much */
4409 size = str - PyBytes_AS_STRING(res);
4410 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004411 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004412 if (_PyBytes_Resize(&res, size) < 0)
4413 goto onError;
4414 }
4415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 Py_XDECREF(errorHandler);
4417 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004418 return res;
4419
4420 onError:
4421 Py_XDECREF(res);
4422 Py_XDECREF(errorHandler);
4423 Py_XDECREF(exc);
4424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425}
4426
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 Py_ssize_t size,
4429 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432}
4433
4434PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4435{
4436 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 PyErr_BadArgument();
4438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 }
4440 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004441 PyUnicode_GET_SIZE(unicode),
4442 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443}
4444
4445/* --- 7-bit ASCII Codec -------------------------------------------------- */
4446
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 Py_ssize_t size,
4449 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 PyUnicodeObject *v;
4453 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004454 Py_ssize_t startinpos;
4455 Py_ssize_t endinpos;
4456 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 const char *e;
4458 PyObject *errorHandler = NULL;
4459 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004462 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 Py_UNICODE r = *(unsigned char*)s;
4464 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004465 }
Tim Petersced69f82003-09-16 20:30:58 +00004466
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 v = _PyUnicode_New(size);
4468 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 e = s + size;
4474 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 register unsigned char c = (unsigned char)*s;
4476 if (c < 128) {
4477 *p++ = c;
4478 ++s;
4479 }
4480 else {
4481 startinpos = s-starts;
4482 endinpos = startinpos + 1;
4483 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4484 if (unicode_decode_call_errorhandler(
4485 errors, &errorHandler,
4486 "ascii", "ordinal not in range(128)",
4487 &starts, &e, &startinpos, &endinpos, &exc, &s,
4488 &v, &outpos, &p))
4489 goto onError;
4490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004492 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4494 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 Py_XDECREF(errorHandler);
4496 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004498
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501 Py_XDECREF(errorHandler);
4502 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 return NULL;
4504}
4505
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 Py_ssize_t size,
4508 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511}
4512
4513PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4514{
4515 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 PyErr_BadArgument();
4517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 }
4519 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 PyUnicode_GET_SIZE(unicode),
4521 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522}
4523
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004524#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004525
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004526/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004527
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004528#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004529#define NEED_RETRY
4530#endif
4531
4532/* XXX This code is limited to "true" double-byte encodings, as
4533 a) it assumes an incomplete character consists of a single byte, and
4534 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004536
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must be accepted by IsDBCSLeadByte().  CharPrev() is then used to
   step back one character from it; the byte counts as a lead byte when
   CharPrev() did not move, when the previous position is not itself a lead
   byte, or when the previous character starts two bytes earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4547
4548/*
4549 * Decode MBCS string into unicode object. If 'final' is set, converts
4550 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4551 */
4552static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 const char *s, /* MBCS string */
4554 int size, /* sizeof MBCS string */
4555 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004556{
4557 Py_UNICODE *p;
4558 Py_ssize_t n = 0;
4559 int usize = 0;
4560
4561 assert(size >= 0);
4562
4563 /* Skip trailing lead-byte unless 'final' is set */
4564 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004565 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004566
4567 /* First get the size of the result */
4568 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4570 if (usize == 0) {
4571 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4572 return -1;
4573 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004574 }
4575
4576 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 /* Create unicode object */
4578 *v = _PyUnicode_New(usize);
4579 if (*v == NULL)
4580 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004581 }
4582 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 /* Extend unicode object */
4584 n = PyUnicode_GET_SIZE(*v);
4585 if (_PyUnicode_Resize(v, n + usize) < 0)
4586 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004587 }
4588
4589 /* Do the conversion */
4590 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 p = PyUnicode_AS_UNICODE(*v) + n;
4592 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4593 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4594 return -1;
4595 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004596 }
4597
4598 return size;
4599}
4600
4601PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 Py_ssize_t size,
4603 const char *errors,
4604 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004605{
4606 PyUnicodeObject *v = NULL;
4607 int done;
4608
4609 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004611
4612#ifdef NEED_RETRY
4613 retry:
4614 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004616 else
4617#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004619
4620 if (done < 0) {
4621 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004623 }
4624
4625 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004626 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004627
4628#ifdef NEED_RETRY
4629 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004630 s += done;
4631 size -= done;
4632 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004633 }
4634#endif
4635
4636 return (PyObject *)v;
4637}
4638
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004639PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004640 Py_ssize_t size,
4641 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004642{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004643 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4644}
4645
4646/*
4647 * Convert unicode into string object (MBCS).
4648 * Returns 0 if succeed, -1 otherwise.
4649 */
4650static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004651 const Py_UNICODE *p, /* unicode */
4652 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004653{
4654 int mbcssize = 0;
4655 Py_ssize_t n = 0;
4656
4657 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004658
4659 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004660 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004661 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4662 if (mbcssize == 0) {
4663 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4664 return -1;
4665 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004666 }
4667
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004668 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 /* Create string object */
4670 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4671 if (*repr == NULL)
4672 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004673 }
4674 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 /* Extend string object */
4676 n = PyBytes_Size(*repr);
4677 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4678 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004679 }
4680
4681 /* Do the conversion */
4682 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004683 char *s = PyBytes_AS_STRING(*repr) + n;
4684 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4685 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4686 return -1;
4687 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004688 }
4689
4690 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004691}
4692
4693PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 Py_ssize_t size,
4695 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004696{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004697 PyObject *repr = NULL;
4698 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004699
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004700#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004702 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004704 else
4705#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004707
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004708 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 Py_XDECREF(repr);
4710 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004711 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004712
4713#ifdef NEED_RETRY
4714 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 p += INT_MAX;
4716 size -= INT_MAX;
4717 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004718 }
4719#endif
4720
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004721 return repr;
4722}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004723
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004724PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4725{
4726 if (!PyUnicode_Check(unicode)) {
4727 PyErr_BadArgument();
4728 return NULL;
4729 }
4730 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 PyUnicode_GET_SIZE(unicode),
4732 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004733}
4734
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004735#undef NEED_RETRY
4736
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004737#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004738
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739/* --- Character Mapping Codec -------------------------------------------- */
4740
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 Py_ssize_t size,
4743 PyObject *mapping,
4744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004747 Py_ssize_t startinpos;
4748 Py_ssize_t endinpos;
4749 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004750 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751 PyUnicodeObject *v;
4752 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004753 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 PyObject *errorHandler = NULL;
4755 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004756 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004757 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004758
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 /* Default to Latin-1 */
4760 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762
4763 v = _PyUnicode_New(size);
4764 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004767 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004770 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 mapstring = PyUnicode_AS_UNICODE(mapping);
4772 maplen = PyUnicode_GET_SIZE(mapping);
4773 while (s < e) {
4774 unsigned char ch = *s;
4775 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 if (ch < maplen)
4778 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 if (x == 0xfffe) {
4781 /* undefined mapping */
4782 outpos = p-PyUnicode_AS_UNICODE(v);
4783 startinpos = s-starts;
4784 endinpos = startinpos+1;
4785 if (unicode_decode_call_errorhandler(
4786 errors, &errorHandler,
4787 "charmap", "character maps to <undefined>",
4788 &starts, &e, &startinpos, &endinpos, &exc, &s,
4789 &v, &outpos, &p)) {
4790 goto onError;
4791 }
4792 continue;
4793 }
4794 *p++ = x;
4795 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004796 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004797 }
4798 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004799 while (s < e) {
4800 unsigned char ch = *s;
4801 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004802
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4804 w = PyLong_FromLong((long)ch);
4805 if (w == NULL)
4806 goto onError;
4807 x = PyObject_GetItem(mapping, w);
4808 Py_DECREF(w);
4809 if (x == NULL) {
4810 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4811 /* No mapping found means: mapping is undefined. */
4812 PyErr_Clear();
4813 x = Py_None;
4814 Py_INCREF(x);
4815 } else
4816 goto onError;
4817 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004818
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 /* Apply mapping */
4820 if (PyLong_Check(x)) {
4821 long value = PyLong_AS_LONG(x);
4822 if (value < 0 || value > 65535) {
4823 PyErr_SetString(PyExc_TypeError,
4824 "character mapping must be in range(65536)");
4825 Py_DECREF(x);
4826 goto onError;
4827 }
4828 *p++ = (Py_UNICODE)value;
4829 }
4830 else if (x == Py_None) {
4831 /* undefined mapping */
4832 outpos = p-PyUnicode_AS_UNICODE(v);
4833 startinpos = s-starts;
4834 endinpos = startinpos+1;
4835 if (unicode_decode_call_errorhandler(
4836 errors, &errorHandler,
4837 "charmap", "character maps to <undefined>",
4838 &starts, &e, &startinpos, &endinpos, &exc, &s,
4839 &v, &outpos, &p)) {
4840 Py_DECREF(x);
4841 goto onError;
4842 }
4843 Py_DECREF(x);
4844 continue;
4845 }
4846 else if (PyUnicode_Check(x)) {
4847 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004848
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 if (targetsize == 1)
4850 /* 1-1 mapping */
4851 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004852
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 else if (targetsize > 1) {
4854 /* 1-n mapping */
4855 if (targetsize > extrachars) {
4856 /* resize first */
4857 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4858 Py_ssize_t needed = (targetsize - extrachars) + \
4859 (targetsize << 2);
4860 extrachars += needed;
4861 /* XXX overflow detection missing */
4862 if (_PyUnicode_Resize(&v,
4863 PyUnicode_GET_SIZE(v) + needed) < 0) {
4864 Py_DECREF(x);
4865 goto onError;
4866 }
4867 p = PyUnicode_AS_UNICODE(v) + oldpos;
4868 }
4869 Py_UNICODE_COPY(p,
4870 PyUnicode_AS_UNICODE(x),
4871 targetsize);
4872 p += targetsize;
4873 extrachars -= targetsize;
4874 }
4875 /* 1-0 mapping: skip the character */
4876 }
4877 else {
4878 /* wrong return value */
4879 PyErr_SetString(PyExc_TypeError,
4880 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004881 Py_DECREF(x);
4882 goto onError;
4883 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 Py_DECREF(x);
4885 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887 }
4888 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004889 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4890 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 Py_XDECREF(errorHandler);
4892 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004894
Benjamin Peterson29060642009-01-31 22:14:21 +00004895 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 Py_XDECREF(errorHandler);
4897 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898 Py_XDECREF(v);
4899 return NULL;
4900}
4901
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004902/* Charmap encoding: the lookup table */
4903
/* Three-level lookup table used for charmap encoding.

   level1 is indexed by the top five bits of a 16-bit code point (c >> 11)
   and holds either the index of a level-2 block or 0xFF.  The level-2 and
   level-3 blocks live back to back in the trailing level23 storage: first
   count2 blocks of 16 bytes, then count3 blocks of 128 bytes.  The object is
   allocated with extra space beyond the one declared element of level23, so
   size computations elsewhere use sizeof(struct encoding_map) - 1. */
struct encoding_map{
    PyObject_HEAD
    unsigned char level1[32];   /* level-2 block index per c>>11, or 0xFF */
    int count2, count3;         /* number of level-2 / level-3 blocks */
    unsigned char level23[1];   /* start of the level-2 then level-3 data */
};
4910
4911static PyObject*
4912encoding_map_size(PyObject *obj, PyObject* args)
4913{
4914 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004915 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004917}
4918
/* Method table for the EncodingMap type: a single no-argument method
   "size" implemented by encoding_map_size(), followed by the zeroed
   sentinel entry that terminates the table. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
4924
/* tp_dealloc slot of EncodingMapType: releases the object's memory with
   PyObject_FREE().  The structure holds no object references of its own,
   so nothing else needs to be released here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
4930
/* Type object for the "EncodingMap" lookup table.  Only three slots are
   populated besides the name, basic size and default flags: tp_dealloc
   (encoding_map_dealloc) and tp_methods (encoding_map_methods).  tp_new is
   left at 0 here; within this file instances are created by
   PyUnicode_BuildEncodingMap(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4974
4975PyObject*
4976PyUnicode_BuildEncodingMap(PyObject* string)
4977{
4978 Py_UNICODE *decode;
4979 PyObject *result;
4980 struct encoding_map *mresult;
4981 int i;
4982 int need_dict = 0;
4983 unsigned char level1[32];
4984 unsigned char level2[512];
4985 unsigned char *mlevel1, *mlevel2, *mlevel3;
4986 int count2 = 0, count3 = 0;
4987
4988 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4989 PyErr_BadArgument();
4990 return NULL;
4991 }
4992 decode = PyUnicode_AS_UNICODE(string);
4993 memset(level1, 0xFF, sizeof level1);
4994 memset(level2, 0xFF, sizeof level2);
4995
4996 /* If there isn't a one-to-one mapping of NULL to \0,
4997 or if there are non-BMP characters, we need to use
4998 a mapping dictionary. */
4999 if (decode[0] != 0)
5000 need_dict = 1;
5001 for (i = 1; i < 256; i++) {
5002 int l1, l2;
5003 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005004#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005005 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005006#endif
5007 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005008 need_dict = 1;
5009 break;
5010 }
5011 if (decode[i] == 0xFFFE)
5012 /* unmapped character */
5013 continue;
5014 l1 = decode[i] >> 11;
5015 l2 = decode[i] >> 7;
5016 if (level1[l1] == 0xFF)
5017 level1[l1] = count2++;
5018 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005019 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005020 }
5021
5022 if (count2 >= 0xFF || count3 >= 0xFF)
5023 need_dict = 1;
5024
5025 if (need_dict) {
5026 PyObject *result = PyDict_New();
5027 PyObject *key, *value;
5028 if (!result)
5029 return NULL;
5030 for (i = 0; i < 256; i++) {
5031 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005032 key = PyLong_FromLong(decode[i]);
5033 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005034 if (!key || !value)
5035 goto failed1;
5036 if (PyDict_SetItem(result, key, value) == -1)
5037 goto failed1;
5038 Py_DECREF(key);
5039 Py_DECREF(value);
5040 }
5041 return result;
5042 failed1:
5043 Py_XDECREF(key);
5044 Py_XDECREF(value);
5045 Py_DECREF(result);
5046 return NULL;
5047 }
5048
5049 /* Create a three-level trie */
5050 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5051 16*count2 + 128*count3 - 1);
5052 if (!result)
5053 return PyErr_NoMemory();
5054 PyObject_Init(result, &EncodingMapType);
5055 mresult = (struct encoding_map*)result;
5056 mresult->count2 = count2;
5057 mresult->count3 = count3;
5058 mlevel1 = mresult->level1;
5059 mlevel2 = mresult->level23;
5060 mlevel3 = mresult->level23 + 16*count2;
5061 memcpy(mlevel1, level1, 32);
5062 memset(mlevel2, 0xFF, 16*count2);
5063 memset(mlevel3, 0, 128*count3);
5064 count3 = 0;
5065 for (i = 1; i < 256; i++) {
5066 int o1, o2, o3, i2, i3;
5067 if (decode[i] == 0xFFFE)
5068 /* unmapped character */
5069 continue;
5070 o1 = decode[i]>>11;
5071 o2 = (decode[i]>>7) & 0xF;
5072 i2 = 16*mlevel1[o1] + o2;
5073 if (mlevel2[i2] == 0xFF)
5074 mlevel2[i2] = count3++;
5075 o3 = decode[i] & 0x7F;
5076 i3 = 128*mlevel2[i2] + o3;
5077 mlevel3[i3] = i;
5078 }
5079 return result;
5080}
5081
5082static int
5083encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5084{
5085 struct encoding_map *map = (struct encoding_map*)mapping;
5086 int l1 = c>>11;
5087 int l2 = (c>>7) & 0xF;
5088 int l3 = c & 0x7F;
5089 int i;
5090
5091#ifdef Py_UNICODE_WIDE
5092 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005094 }
5095#endif
5096 if (c == 0)
5097 return 0;
5098 /* level 1*/
5099 i = map->level1[l1];
5100 if (i == 0xFF) {
5101 return -1;
5102 }
5103 /* level 2*/
5104 i = map->level23[16*i+l2];
5105 if (i == 0xFF) {
5106 return -1;
5107 }
5108 /* level 3 */
5109 i = map->level23[16*map->count2 + 128*i + l3];
5110 if (i == 0) {
5111 return -1;
5112 }
5113 return i;
5114}
5115
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116/* Lookup the character ch in the mapping. If the character
5117 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005118 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005119static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120{
Christian Heimes217cfd12007-12-02 14:31:20 +00005121 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 PyObject *x;
5123
5124 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126 x = PyObject_GetItem(mapping, w);
5127 Py_DECREF(w);
5128 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5130 /* No mapping found means: mapping is undefined. */
5131 PyErr_Clear();
5132 x = Py_None;
5133 Py_INCREF(x);
5134 return x;
5135 } else
5136 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005138 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005140 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 long value = PyLong_AS_LONG(x);
5142 if (value < 0 || value > 255) {
5143 PyErr_SetString(PyExc_TypeError,
5144 "character mapping must be in range(256)");
5145 Py_DECREF(x);
5146 return NULL;
5147 }
5148 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005150 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 /* wrong return value */
5154 PyErr_Format(PyExc_TypeError,
5155 "character mapping must return integer, bytes or None, not %.400s",
5156 x->ob_type->tp_name);
5157 Py_DECREF(x);
5158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 }
5160}
5161
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005162static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005163charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005164{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005165 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5166 /* exponentially overallocate to minimize reallocations */
5167 if (requiredsize < 2*outsize)
5168 requiredsize = 2*outsize;
5169 if (_PyBytes_Resize(outobj, requiredsize))
5170 return -1;
5171 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005172}
5173
/* Outcome of writing one character with charmapencode_output():
   enc_SUCCESS   - the character was written to the output;
   enc_FAILED    - the mapping has no entry for the character;
   enc_EXCEPTION - the lookup or an output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005178 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 space is available. Return a new reference to the object that
5180 was put in the output buffer, or Py_None, if the mapping was undefined
5181 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005182 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005183static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005184charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005186{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005187 PyObject *rep;
5188 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005189 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005190
Christian Heimes90aa7642007-12-19 02:45:37 +00005191 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005192 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005194 if (res == -1)
5195 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 if (outsize<requiredsize)
5197 if (charmapencode_resize(outobj, outpos, requiredsize))
5198 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005199 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 outstart[(*outpos)++] = (char)res;
5201 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005202 }
5203
5204 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005207 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 Py_DECREF(rep);
5209 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005210 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 if (PyLong_Check(rep)) {
5212 Py_ssize_t requiredsize = *outpos+1;
5213 if (outsize<requiredsize)
5214 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5215 Py_DECREF(rep);
5216 return enc_EXCEPTION;
5217 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005218 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005219 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005220 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 else {
5222 const char *repchars = PyBytes_AS_STRING(rep);
5223 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5224 Py_ssize_t requiredsize = *outpos+repsize;
5225 if (outsize<requiredsize)
5226 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5227 Py_DECREF(rep);
5228 return enc_EXCEPTION;
5229 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005230 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 memcpy(outstart + *outpos, repchars, repsize);
5232 *outpos += repsize;
5233 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005235 Py_DECREF(rep);
5236 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237}
5238
5239/* handle an error in PyUnicode_EncodeCharmap
5240 Return 0 on success, -1 on error */
5241static
5242int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005243 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005245 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005246 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005247{
5248 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005249 Py_ssize_t repsize;
5250 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005251 Py_UNICODE *uni2;
5252 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005253 Py_ssize_t collstartpos = *inpos;
5254 Py_ssize_t collendpos = *inpos+1;
5255 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005256 char *encoding = "charmap";
5257 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005258 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005259
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005260 /* find all unencodable characters */
5261 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005262 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005263 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 int res = encoding_map_lookup(p[collendpos], mapping);
5265 if (res != -1)
5266 break;
5267 ++collendpos;
5268 continue;
5269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005270
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 rep = charmapencode_lookup(p[collendpos], mapping);
5272 if (rep==NULL)
5273 return -1;
5274 else if (rep!=Py_None) {
5275 Py_DECREF(rep);
5276 break;
5277 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005278 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005280 }
5281 /* cache callback name lookup
5282 * (if not done yet, i.e. it's the first error) */
5283 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 if ((errors==NULL) || (!strcmp(errors, "strict")))
5285 *known_errorHandler = 1;
5286 else if (!strcmp(errors, "replace"))
5287 *known_errorHandler = 2;
5288 else if (!strcmp(errors, "ignore"))
5289 *known_errorHandler = 3;
5290 else if (!strcmp(errors, "xmlcharrefreplace"))
5291 *known_errorHandler = 4;
5292 else
5293 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005294 }
5295 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005296 case 1: /* strict */
5297 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5298 return -1;
5299 case 2: /* replace */
5300 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 x = charmapencode_output('?', mapping, res, respos);
5302 if (x==enc_EXCEPTION) {
5303 return -1;
5304 }
5305 else if (x==enc_FAILED) {
5306 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5307 return -1;
5308 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005309 }
5310 /* fall through */
5311 case 3: /* ignore */
5312 *inpos = collendpos;
5313 break;
5314 case 4: /* xmlcharrefreplace */
5315 /* generate replacement (temporarily (mis)uses p) */
5316 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 char buffer[2+29+1+1];
5318 char *cp;
5319 sprintf(buffer, "&#%d;", (int)p[collpos]);
5320 for (cp = buffer; *cp; ++cp) {
5321 x = charmapencode_output(*cp, mapping, res, respos);
5322 if (x==enc_EXCEPTION)
5323 return -1;
5324 else if (x==enc_FAILED) {
5325 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5326 return -1;
5327 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005328 }
5329 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005330 *inpos = collendpos;
5331 break;
5332 default:
5333 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 encoding, reason, p, size, exceptionObject,
5335 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005336 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005338 if (PyBytes_Check(repunicode)) {
5339 /* Directly copy bytes result to output. */
5340 Py_ssize_t outsize = PyBytes_Size(*res);
5341 Py_ssize_t requiredsize;
5342 repsize = PyBytes_Size(repunicode);
5343 requiredsize = *respos + repsize;
5344 if (requiredsize > outsize)
5345 /* Make room for all additional bytes. */
5346 if (charmapencode_resize(res, respos, requiredsize)) {
5347 Py_DECREF(repunicode);
5348 return -1;
5349 }
5350 memcpy(PyBytes_AsString(*res) + *respos,
5351 PyBytes_AsString(repunicode), repsize);
5352 *respos += repsize;
5353 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005354 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005355 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005356 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005357 /* generate replacement */
5358 repsize = PyUnicode_GET_SIZE(repunicode);
5359 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 x = charmapencode_output(*uni2, mapping, res, respos);
5361 if (x==enc_EXCEPTION) {
5362 return -1;
5363 }
5364 else if (x==enc_FAILED) {
5365 Py_DECREF(repunicode);
5366 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5367 return -1;
5368 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005369 }
5370 *inpos = newpos;
5371 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 }
5373 return 0;
5374}
5375
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 Py_ssize_t size,
5378 PyObject *mapping,
5379 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005381 /* output object */
5382 PyObject *res = NULL;
5383 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005384 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005385 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005386 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005387 PyObject *errorHandler = NULL;
5388 PyObject *exc = NULL;
5389 /* the following variable is used for caching string comparisons
5390 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5391 * 3=ignore, 4=xmlcharrefreplace */
5392 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393
5394 /* Default to Latin-1 */
5395 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398 /* allocate enough for a simple encoding without
5399 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005400 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005401 if (res == NULL)
5402 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005403 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005406 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 /* try to encode it */
5408 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5409 if (x==enc_EXCEPTION) /* error */
5410 goto onError;
5411 if (x==enc_FAILED) { /* unencodable character */
5412 if (charmap_encoding_error(p, size, &inpos, mapping,
5413 &exc,
5414 &known_errorHandler, &errorHandler, errors,
5415 &res, &respos)) {
5416 goto onError;
5417 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005418 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 else
5420 /* done with this character => adjust input position */
5421 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005424 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005425 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005426 if (_PyBytes_Resize(&res, respos) < 0)
5427 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005429 Py_XDECREF(exc);
5430 Py_XDECREF(errorHandler);
5431 return res;
5432
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 Py_XDECREF(res);
5435 Py_XDECREF(exc);
5436 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 return NULL;
5438}
5439
5440PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442{
5443 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 PyErr_BadArgument();
5445 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 }
5447 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 PyUnicode_GET_SIZE(unicode),
5449 mapping,
5450 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451}
5452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005453/* create or adjust a UnicodeTranslateError */
5454static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 const Py_UNICODE *unicode, Py_ssize_t size,
5456 Py_ssize_t startpos, Py_ssize_t endpos,
5457 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005460 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 }
5463 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5465 goto onError;
5466 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5467 goto onError;
5468 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5469 goto onError;
5470 return;
5471 onError:
5472 Py_DECREF(*exceptionObject);
5473 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 }
5475}
5476
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005477/* raises a UnicodeTranslateError */
5478static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 const Py_UNICODE *unicode, Py_ssize_t size,
5480 Py_ssize_t startpos, Py_ssize_t endpos,
5481 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482{
5483 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005485 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487}
5488
5489/* error handling callback helper:
5490 build arguments, call the callback and check the arguments,
5491 put the result into newpos and return the replacement string, which
5492 has to be freed by the caller */
5493static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 PyObject **errorHandler,
5495 const char *reason,
5496 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5497 Py_ssize_t startpos, Py_ssize_t endpos,
5498 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005500 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005502 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503 PyObject *restuple;
5504 PyObject *resunicode;
5505
5506 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 }
5511
5512 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005514 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516
5517 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005521 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005522 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 Py_DECREF(restuple);
5524 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005525 }
5526 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 &resunicode, &i_newpos)) {
5528 Py_DECREF(restuple);
5529 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005530 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005531 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005533 else
5534 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005535 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5537 Py_DECREF(restuple);
5538 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005539 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005540 Py_INCREF(resunicode);
5541 Py_DECREF(restuple);
5542 return resunicode;
5543}
5544
5545/* Lookup the character ch in the mapping and put the result in result,
5546 which must be decrefed by the caller.
5547 Return 0 on success, -1 on error */
5548static
5549int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5550{
Christian Heimes217cfd12007-12-02 14:31:20 +00005551 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005552 PyObject *x;
5553
5554 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005556 x = PyObject_GetItem(mapping, w);
5557 Py_DECREF(w);
5558 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5560 /* No mapping found means: use 1:1 mapping. */
5561 PyErr_Clear();
5562 *result = NULL;
5563 return 0;
5564 } else
5565 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566 }
5567 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 *result = x;
5569 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005571 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 long value = PyLong_AS_LONG(x);
5573 long max = PyUnicode_GetMax();
5574 if (value < 0 || value > max) {
5575 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005576 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 Py_DECREF(x);
5578 return -1;
5579 }
5580 *result = x;
5581 return 0;
5582 }
5583 else if (PyUnicode_Check(x)) {
5584 *result = x;
5585 return 0;
5586 }
5587 else {
5588 /* wrong return value */
5589 PyErr_SetString(PyExc_TypeError,
5590 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005591 Py_DECREF(x);
5592 return -1;
5593 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594}
5595/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 if not reallocate and adjust various state variables.
5597 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598static
Walter Dörwald4894c302003-10-24 14:25:28 +00005599int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005601{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005602 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005603 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 /* remember old output position */
5605 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5606 /* exponentially overallocate to minimize reallocations */
5607 if (requiredsize < 2 * oldsize)
5608 requiredsize = 2 * oldsize;
5609 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5610 return -1;
5611 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005612 }
5613 return 0;
5614}
5615/* lookup the character, put the result in the output string and adjust
5616 various state variables. Return a new reference to the object that
5617 was put in the output buffer in *result, or Py_None, if the mapping was
5618 undefined (in which case no character was written).
5619 The called must decref result.
5620 Return 0 on success, -1 on error. */
5621static
Walter Dörwald4894c302003-10-24 14:25:28 +00005622int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5624 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625{
Walter Dörwald4894c302003-10-24 14:25:28 +00005626 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 /* not found => default to 1:1 mapping */
5630 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631 }
5632 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005634 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 /* no overflow check, because we know that the space is enough */
5636 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637 }
5638 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5640 if (repsize==1) {
5641 /* no overflow check, because we know that the space is enough */
5642 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5643 }
5644 else if (repsize!=0) {
5645 /* more than one character */
5646 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5647 (insize - (curinp-startinp)) +
5648 repsize - 1;
5649 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5650 return -1;
5651 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5652 *outp += repsize;
5653 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654 }
5655 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005657 return 0;
5658}
5659
5660PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 Py_ssize_t size,
5662 PyObject *mapping,
5663 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665 /* output object */
5666 PyObject *res = NULL;
5667 /* pointers to the beginning and end+1 of input */
5668 const Py_UNICODE *startp = p;
5669 const Py_UNICODE *endp = p + size;
5670 /* pointer into the output */
5671 Py_UNICODE *str;
5672 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005673 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674 char *reason = "character maps to <undefined>";
5675 PyObject *errorHandler = NULL;
5676 PyObject *exc = NULL;
5677 /* the following variable is used for caching string comparisons
5678 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5679 * 3=ignore, 4=xmlcharrefreplace */
5680 int known_errorHandler = -1;
5681
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 PyErr_BadArgument();
5684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686
5687 /* allocate enough for a simple 1:1 translation without
5688 replacements, if we need more, we'll resize */
5689 res = PyUnicode_FromUnicode(NULL, size);
5690 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 /* try to encode it */
5698 PyObject *x = NULL;
5699 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5700 Py_XDECREF(x);
5701 goto onError;
5702 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005703 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 if (x!=Py_None) /* it worked => adjust input pointer */
5705 ++p;
5706 else { /* untranslatable character */
5707 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5708 Py_ssize_t repsize;
5709 Py_ssize_t newpos;
5710 Py_UNICODE *uni2;
5711 /* startpos for collecting untranslatable chars */
5712 const Py_UNICODE *collstart = p;
5713 const Py_UNICODE *collend = p+1;
5714 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 /* find all untranslatable characters */
5717 while (collend < endp) {
5718 if (charmaptranslate_lookup(*collend, mapping, &x))
5719 goto onError;
5720 Py_XDECREF(x);
5721 if (x!=Py_None)
5722 break;
5723 ++collend;
5724 }
5725 /* cache callback name lookup
5726 * (if not done yet, i.e. it's the first error) */
5727 if (known_errorHandler==-1) {
5728 if ((errors==NULL) || (!strcmp(errors, "strict")))
5729 known_errorHandler = 1;
5730 else if (!strcmp(errors, "replace"))
5731 known_errorHandler = 2;
5732 else if (!strcmp(errors, "ignore"))
5733 known_errorHandler = 3;
5734 else if (!strcmp(errors, "xmlcharrefreplace"))
5735 known_errorHandler = 4;
5736 else
5737 known_errorHandler = 0;
5738 }
5739 switch (known_errorHandler) {
5740 case 1: /* strict */
5741 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005742 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 case 2: /* replace */
5744 /* No need to check for space, this is a 1:1 replacement */
5745 for (coll = collstart; coll<collend; ++coll)
5746 *str++ = '?';
5747 /* fall through */
5748 case 3: /* ignore */
5749 p = collend;
5750 break;
5751 case 4: /* xmlcharrefreplace */
5752 /* generate replacement (temporarily (mis)uses p) */
5753 for (p = collstart; p < collend; ++p) {
5754 char buffer[2+29+1+1];
5755 char *cp;
5756 sprintf(buffer, "&#%d;", (int)*p);
5757 if (charmaptranslate_makespace(&res, &str,
5758 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5759 goto onError;
5760 for (cp = buffer; *cp; ++cp)
5761 *str++ = *cp;
5762 }
5763 p = collend;
5764 break;
5765 default:
5766 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5767 reason, startp, size, &exc,
5768 collstart-startp, collend-startp, &newpos);
5769 if (repunicode == NULL)
5770 goto onError;
5771 /* generate replacement */
5772 repsize = PyUnicode_GET_SIZE(repunicode);
5773 if (charmaptranslate_makespace(&res, &str,
5774 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5775 Py_DECREF(repunicode);
5776 goto onError;
5777 }
5778 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5779 *str++ = *uni2;
5780 p = startp + newpos;
5781 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005783 }
5784 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 /* Resize if we allocated to much */
5786 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005787 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 if (PyUnicode_Resize(&res, respos) < 0)
5789 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 }
5791 Py_XDECREF(exc);
5792 Py_XDECREF(errorHandler);
5793 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 Py_XDECREF(res);
5797 Py_XDECREF(exc);
5798 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 return NULL;
5800}
5801
5802PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 PyObject *mapping,
5804 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805{
5806 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005807
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 str = PyUnicode_FromObject(str);
5809 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 PyUnicode_GET_SIZE(str),
5813 mapping,
5814 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 Py_DECREF(str);
5816 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005817
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 Py_XDECREF(str);
5820 return NULL;
5821}
Tim Petersced69f82003-09-16 20:30:58 +00005822
Guido van Rossum9e896b32000-04-05 20:11:21 +00005823/* --- Decimal Encoder ---------------------------------------------------- */
5824
5825int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 Py_ssize_t length,
5827 char *output,
5828 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005829{
5830 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831 PyObject *errorHandler = NULL;
5832 PyObject *exc = NULL;
5833 const char *encoding = "decimal";
5834 const char *reason = "invalid decimal Unicode string";
5835 /* the following variable is used for caching string comparisons
5836 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5837 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005838
5839 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 PyErr_BadArgument();
5841 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005842 }
5843
5844 p = s;
5845 end = s + length;
5846 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 register Py_UNICODE ch = *p;
5848 int decimal;
5849 PyObject *repunicode;
5850 Py_ssize_t repsize;
5851 Py_ssize_t newpos;
5852 Py_UNICODE *uni2;
5853 Py_UNICODE *collstart;
5854 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005855
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005857 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 ++p;
5859 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005860 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 decimal = Py_UNICODE_TODECIMAL(ch);
5862 if (decimal >= 0) {
5863 *output++ = '0' + decimal;
5864 ++p;
5865 continue;
5866 }
5867 if (0 < ch && ch < 256) {
5868 *output++ = (char)ch;
5869 ++p;
5870 continue;
5871 }
5872 /* All other characters are considered unencodable */
5873 collstart = p;
5874 collend = p+1;
5875 while (collend < end) {
5876 if ((0 < *collend && *collend < 256) ||
5877 !Py_UNICODE_ISSPACE(*collend) ||
5878 Py_UNICODE_TODECIMAL(*collend))
5879 break;
5880 }
5881 /* cache callback name lookup
5882 * (if not done yet, i.e. it's the first error) */
5883 if (known_errorHandler==-1) {
5884 if ((errors==NULL) || (!strcmp(errors, "strict")))
5885 known_errorHandler = 1;
5886 else if (!strcmp(errors, "replace"))
5887 known_errorHandler = 2;
5888 else if (!strcmp(errors, "ignore"))
5889 known_errorHandler = 3;
5890 else if (!strcmp(errors, "xmlcharrefreplace"))
5891 known_errorHandler = 4;
5892 else
5893 known_errorHandler = 0;
5894 }
5895 switch (known_errorHandler) {
5896 case 1: /* strict */
5897 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5898 goto onError;
5899 case 2: /* replace */
5900 for (p = collstart; p < collend; ++p)
5901 *output++ = '?';
5902 /* fall through */
5903 case 3: /* ignore */
5904 p = collend;
5905 break;
5906 case 4: /* xmlcharrefreplace */
5907 /* generate replacement (temporarily (mis)uses p) */
5908 for (p = collstart; p < collend; ++p)
5909 output += sprintf(output, "&#%d;", (int)*p);
5910 p = collend;
5911 break;
5912 default:
5913 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5914 encoding, reason, s, length, &exc,
5915 collstart-s, collend-s, &newpos);
5916 if (repunicode == NULL)
5917 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005918 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005919 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005920 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5921 Py_DECREF(repunicode);
5922 goto onError;
5923 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 /* generate replacement */
5925 repsize = PyUnicode_GET_SIZE(repunicode);
5926 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5927 Py_UNICODE ch = *uni2;
5928 if (Py_UNICODE_ISSPACE(ch))
5929 *output++ = ' ';
5930 else {
5931 decimal = Py_UNICODE_TODECIMAL(ch);
5932 if (decimal >= 0)
5933 *output++ = '0' + decimal;
5934 else if (0 < ch && ch < 256)
5935 *output++ = (char)ch;
5936 else {
5937 Py_DECREF(repunicode);
5938 raise_encode_exception(&exc, encoding,
5939 s, length, collstart-s, collend-s, reason);
5940 goto onError;
5941 }
5942 }
5943 }
5944 p = s + newpos;
5945 Py_DECREF(repunicode);
5946 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005947 }
5948 /* 0-terminate the output string */
5949 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005950 Py_XDECREF(exc);
5951 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005952 return 0;
5953
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005955 Py_XDECREF(exc);
5956 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005957 return -1;
5958}
5959
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960/* --- Helpers ------------------------------------------------------------ */
5961
Eric Smith8c663262007-08-25 02:26:07 +00005962#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005963#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005964
Thomas Wouters477c8d52006-05-27 19:21:47 +00005965#include "stringlib/count.h"
5966#include "stringlib/find.h"
5967#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005968#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005969
Eric Smith5807c412008-05-11 21:00:57 +00005970#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005971#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005972#include "stringlib/localeutil.h"
5973
/* Helper macro that fixes up a (start, end) slice pair against a sequence
   of length len:
     - an end greater than len is cut down to len;
     - a negative start or end is taken relative to len and clamped to 0
       if it is still negative afterwards.
   start and end must be modifiable lvalues; they are updated in place and
   every argument may be evaluated more than once.  start is not clamped
   from above, so start > end is possible afterwards.  The expansion is
   wrapped in do { } while (0) so that it behaves as a single statement
   (e.g. inside an unbraced if/else); invoke it with a trailing ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005988
Martin v. Löwis18e16552006-02-15 17:27:45 +00005989Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005990 PyObject *substr,
5991 Py_ssize_t start,
5992 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005994 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005995 PyUnicodeObject* str_obj;
5996 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005997
Thomas Wouters477c8d52006-05-27 19:21:47 +00005998 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5999 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006001 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6002 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 Py_DECREF(str_obj);
6004 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
Tim Petersced69f82003-09-16 20:30:58 +00006006
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006007 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006008 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006009 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6010 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006011 );
6012
6013 Py_DECREF(sub_obj);
6014 Py_DECREF(str_obj);
6015
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 return result;
6017}
6018
Martin v. Löwis18e16552006-02-15 17:27:45 +00006019Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006020 PyObject *sub,
6021 Py_ssize_t start,
6022 Py_ssize_t end,
6023 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006025 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006026
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006028 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006030 sub = PyUnicode_FromObject(sub);
6031 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 Py_DECREF(str);
6033 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 }
Tim Petersced69f82003-09-16 20:30:58 +00006035
Thomas Wouters477c8d52006-05-27 19:21:47 +00006036 if (direction > 0)
6037 result = stringlib_find_slice(
6038 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6039 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6040 start, end
6041 );
6042 else
6043 result = stringlib_rfind_slice(
6044 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6045 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6046 start, end
6047 );
6048
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006050 Py_DECREF(sub);
6051
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 return result;
6053}
6054
Tim Petersced69f82003-09-16 20:30:58 +00006055static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 PyUnicodeObject *substring,
6058 Py_ssize_t start,
6059 Py_ssize_t end,
6060 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 if (substring->length == 0)
6063 return 1;
6064
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006065 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 end -= substring->length;
6067 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069
6070 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 if (Py_UNICODE_MATCH(self, end, substring))
6072 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 } else {
6074 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 }
6077
6078 return 0;
6079}
6080
Martin v. Löwis18e16552006-02-15 17:27:45 +00006081Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 PyObject *substr,
6083 Py_ssize_t start,
6084 Py_ssize_t end,
6085 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006087 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006088
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 str = PyUnicode_FromObject(str);
6090 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 substr = PyUnicode_FromObject(substr);
6093 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 Py_DECREF(str);
6095 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 }
Tim Petersced69f82003-09-16 20:30:58 +00006097
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 (PyUnicodeObject *)substr,
6100 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 Py_DECREF(str);
6102 Py_DECREF(substr);
6103 return result;
6104}
6105
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106/* Apply fixfct filter to the Unicode object self and return a
6107 reference to the modified object */
6108
Tim Petersced69f82003-09-16 20:30:58 +00006109static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112{
6113
6114 PyUnicodeObject *u;
6115
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006116 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006119
6120 Py_UNICODE_COPY(u->str, self->str, self->length);
6121
Tim Peters7a29bd52001-09-12 03:03:31 +00006122 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 /* fixfct should return TRUE if it modified the buffer. If
6124 FALSE, return a reference to the original buffer instead
6125 (to save space, not time) */
6126 Py_INCREF(self);
6127 Py_DECREF(u);
6128 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 }
6130 return (PyObject*) u;
6131}
6132
Tim Petersced69f82003-09-16 20:30:58 +00006133static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134int fixupper(PyUnicodeObject *self)
6135{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006136 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 Py_UNICODE *s = self->str;
6138 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006139
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006142
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 ch = Py_UNICODE_TOUPPER(*s);
6144 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 *s = ch;
6147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 s++;
6149 }
6150
6151 return status;
6152}
6153
Tim Petersced69f82003-09-16 20:30:58 +00006154static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155int fixlower(PyUnicodeObject *self)
6156{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006157 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 Py_UNICODE *s = self->str;
6159 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006160
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006163
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 ch = Py_UNICODE_TOLOWER(*s);
6165 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 *s = ch;
6168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 s++;
6170 }
6171
6172 return status;
6173}
6174
Tim Petersced69f82003-09-16 20:30:58 +00006175static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176int fixswapcase(PyUnicodeObject *self)
6177{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006178 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 Py_UNICODE *s = self->str;
6180 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006181
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 while (len-- > 0) {
6183 if (Py_UNICODE_ISUPPER(*s)) {
6184 *s = Py_UNICODE_TOLOWER(*s);
6185 status = 1;
6186 } else if (Py_UNICODE_ISLOWER(*s)) {
6187 *s = Py_UNICODE_TOUPPER(*s);
6188 status = 1;
6189 }
6190 s++;
6191 }
6192
6193 return status;
6194}
6195
Tim Petersced69f82003-09-16 20:30:58 +00006196static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197int fixcapitalize(PyUnicodeObject *self)
6198{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006199 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006200 Py_UNICODE *s = self->str;
6201 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006202
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006203 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006205 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 *s = Py_UNICODE_TOUPPER(*s);
6207 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006209 s++;
6210 while (--len > 0) {
6211 if (Py_UNICODE_ISUPPER(*s)) {
6212 *s = Py_UNICODE_TOLOWER(*s);
6213 status = 1;
6214 }
6215 s++;
6216 }
6217 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218}
6219
6220static
6221int fixtitle(PyUnicodeObject *self)
6222{
6223 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6224 register Py_UNICODE *e;
6225 int previous_is_cased;
6226
6227 /* Shortcut for single character strings */
6228 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6230 if (*p != ch) {
6231 *p = ch;
6232 return 1;
6233 }
6234 else
6235 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 }
Tim Petersced69f82003-09-16 20:30:58 +00006237
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 e = p + PyUnicode_GET_SIZE(self);
6239 previous_is_cased = 0;
6240 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006242
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 if (previous_is_cased)
6244 *p = Py_UNICODE_TOLOWER(ch);
6245 else
6246 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006247
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 if (Py_UNICODE_ISLOWER(ch) ||
6249 Py_UNICODE_ISUPPER(ch) ||
6250 Py_UNICODE_ISTITLE(ch))
6251 previous_is_cased = 1;
6252 else
6253 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 }
6255 return 1;
6256}
6257
Tim Peters8ce9f162004-08-27 01:49:32 +00006258PyObject *
6259PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260{
Skip Montanaro6543b452004-09-16 03:28:13 +00006261 const Py_UNICODE blank = ' ';
6262 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006263 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006264 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006265 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6266 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006267 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6268 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006269 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006270 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
Tim Peters05eba1f2004-08-27 21:32:02 +00006272 fseq = PySequence_Fast(seq, "");
6273 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006274 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006275 }
6276
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006277 /* NOTE: the following code can't call back into Python code,
6278 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006279 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006280
Tim Peters05eba1f2004-08-27 21:32:02 +00006281 seqlen = PySequence_Fast_GET_SIZE(fseq);
6282 /* If empty sequence, return u"". */
6283 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006284 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6285 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006286 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006287 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006288 /* If singleton sequence with an exact Unicode, return that. */
6289 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 item = items[0];
6291 if (PyUnicode_CheckExact(item)) {
6292 Py_INCREF(item);
6293 res = (PyUnicodeObject *)item;
6294 goto Done;
6295 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006296 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006297 else {
6298 /* Set up sep and seplen */
6299 if (separator == NULL) {
6300 sep = &blank;
6301 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006302 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006303 else {
6304 if (!PyUnicode_Check(separator)) {
6305 PyErr_Format(PyExc_TypeError,
6306 "separator: expected str instance,"
6307 " %.80s found",
6308 Py_TYPE(separator)->tp_name);
6309 goto onError;
6310 }
6311 sep = PyUnicode_AS_UNICODE(separator);
6312 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006313 }
6314 }
6315
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006316 /* There are at least two things to join, or else we have a subclass
6317 * of str in the sequence.
6318 * Do a pre-pass to figure out the total amount of space we'll
6319 * need (sz), and see whether all argument are strings.
6320 */
6321 sz = 0;
6322 for (i = 0; i < seqlen; i++) {
6323 const Py_ssize_t old_sz = sz;
6324 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 if (!PyUnicode_Check(item)) {
6326 PyErr_Format(PyExc_TypeError,
6327 "sequence item %zd: expected str instance,"
6328 " %.80s found",
6329 i, Py_TYPE(item)->tp_name);
6330 goto onError;
6331 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006332 sz += PyUnicode_GET_SIZE(item);
6333 if (i != 0)
6334 sz += seplen;
6335 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6336 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006338 goto onError;
6339 }
6340 }
Tim Petersced69f82003-09-16 20:30:58 +00006341
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006342 res = _PyUnicode_New(sz);
6343 if (res == NULL)
6344 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006345
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006346 /* Catenate everything. */
6347 res_p = PyUnicode_AS_UNICODE(res);
6348 for (i = 0; i < seqlen; ++i) {
6349 Py_ssize_t itemlen;
6350 item = items[i];
6351 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 /* Copy item, and maybe the separator. */
6353 if (i) {
6354 Py_UNICODE_COPY(res_p, sep, seplen);
6355 res_p += seplen;
6356 }
6357 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6358 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006359 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006360
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006362 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363 return (PyObject *)res;
6364
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006366 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006367 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 return NULL;
6369}
6370
Tim Petersced69f82003-09-16 20:30:58 +00006371static
6372PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 Py_ssize_t left,
6374 Py_ssize_t right,
6375 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376{
6377 PyUnicodeObject *u;
6378
6379 if (left < 0)
6380 left = 0;
6381 if (right < 0)
6382 right = 0;
6383
Tim Peters7a29bd52001-09-12 03:03:31 +00006384 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385 Py_INCREF(self);
6386 return self;
6387 }
6388
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006389 if (left > PY_SSIZE_T_MAX - self->length ||
6390 right > PY_SSIZE_T_MAX - (left + self->length)) {
6391 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6392 return NULL;
6393 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 u = _PyUnicode_New(left + self->length + right);
6395 if (u) {
6396 if (left)
6397 Py_UNICODE_FILL(u->str, fill, left);
6398 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6399 if (right)
6400 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6401 }
6402
6403 return u;
6404}
6405
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006406PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
6410 string = PyUnicode_FromObject(string);
6411 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006414 list = stringlib_splitlines(
6415 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6416 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417
6418 Py_DECREF(string);
6419 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420}
6421
Tim Petersced69f82003-09-16 20:30:58 +00006422static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 PyUnicodeObject *substring,
6425 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006428 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006431 return stringlib_split_whitespace(
6432 (PyObject*) self, self->str, self->length, maxcount
6433 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006435 return stringlib_split(
6436 (PyObject*) self, self->str, self->length,
6437 substring->str, substring->length,
6438 maxcount
6439 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440}
6441
Tim Petersced69f82003-09-16 20:30:58 +00006442static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006443PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 PyUnicodeObject *substring,
6445 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006446{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006447 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006448 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006449
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006450 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006451 return stringlib_rsplit_whitespace(
6452 (PyObject*) self, self->str, self->length, maxcount
6453 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006454
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006455 return stringlib_rsplit(
6456 (PyObject*) self, self->str, self->length,
6457 substring->str, substring->length,
6458 maxcount
6459 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006460}
6461
6462static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 PyUnicodeObject *str1,
6465 PyUnicodeObject *str2,
6466 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467{
6468 PyUnicodeObject *u;
6469
6470 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006472 else if (maxcount == 0 || self->length == 0)
6473 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474
Thomas Wouters477c8d52006-05-27 19:21:47 +00006475 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006476 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006478 if (str1->length == 0)
6479 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006480 if (str1->length == 1) {
6481 /* replace characters */
6482 Py_UNICODE u1, u2;
6483 if (!findchar(self->str, self->length, str1->str[0]))
6484 goto nothing;
6485 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6486 if (!u)
6487 return NULL;
6488 Py_UNICODE_COPY(u->str, self->str, self->length);
6489 u1 = str1->str[0];
6490 u2 = str2->str[0];
6491 for (i = 0; i < u->length; i++)
6492 if (u->str[i] == u1) {
6493 if (--maxcount < 0)
6494 break;
6495 u->str[i] = u2;
6496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006498 i = stringlib_find(
6499 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006501 if (i < 0)
6502 goto nothing;
6503 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6504 if (!u)
6505 return NULL;
6506 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006507
6508 /* change everything in-place, starting with this one */
6509 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6510 i += str1->length;
6511
6512 while ( --maxcount > 0) {
6513 i = stringlib_find(self->str+i, self->length-i,
6514 str1->str, str1->length,
6515 i);
6516 if (i == -1)
6517 break;
6518 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6519 i += str1->length;
6520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006523
6524 Py_ssize_t n, i, j, e;
6525 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 Py_UNICODE *p;
6527
6528 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006529 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6530 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006531 if (n == 0)
6532 goto nothing;
6533 /* new_size = self->length + n * (str2->length - str1->length)); */
6534 delta = (str2->length - str1->length);
6535 if (delta == 0) {
6536 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006538 product = n * (str2->length - str1->length);
6539 if ((product / (str2->length - str1->length)) != n) {
6540 PyErr_SetString(PyExc_OverflowError,
6541 "replace string is too long");
6542 return NULL;
6543 }
6544 new_size = self->length + product;
6545 if (new_size < 0) {
6546 PyErr_SetString(PyExc_OverflowError,
6547 "replace string is too long");
6548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 }
6550 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006551 u = _PyUnicode_New(new_size);
6552 if (!u)
6553 return NULL;
6554 i = 0;
6555 p = u->str;
6556 e = self->length - str1->length;
6557 if (str1->length > 0) {
6558 while (n-- > 0) {
6559 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006560 j = stringlib_find(self->str+i, self->length-i,
6561 str1->str, str1->length,
6562 i);
6563 if (j == -1)
6564 break;
6565 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006566 /* copy unchanged part [i:j] */
6567 Py_UNICODE_COPY(p, self->str+i, j-i);
6568 p += j - i;
6569 }
6570 /* copy substitution string */
6571 if (str2->length > 0) {
6572 Py_UNICODE_COPY(p, str2->str, str2->length);
6573 p += str2->length;
6574 }
6575 i = j + str1->length;
6576 }
6577 if (i < self->length)
6578 /* copy tail [i:] */
6579 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6580 } else {
6581 /* interleave */
6582 while (n > 0) {
6583 Py_UNICODE_COPY(p, str2->str, str2->length);
6584 p += str2->length;
6585 if (--n <= 0)
6586 break;
6587 *p++ = self->str[i++];
6588 }
6589 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006593
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006595 /* nothing to replace; return original string (when possible) */
6596 if (PyUnicode_CheckExact(self)) {
6597 Py_INCREF(self);
6598 return (PyObject *) self;
6599 }
6600 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601}
6602
6603/* --- Unicode Object Methods --------------------------------------------- */
6604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006605PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607\n\
6608Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006609characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610
6611static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006612unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 return fixup(self, fixtitle);
6615}
6616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006617PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619\n\
6620Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006621have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
6623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006624unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 return fixup(self, fixcapitalize);
6627}
6628
6629#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006630PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632\n\
6633Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006634normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635
6636static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006637unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638{
6639 PyObject *list;
6640 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006641 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 /* Split into words */
6644 list = split(self, NULL, -1);
6645 if (!list)
6646 return NULL;
6647
6648 /* Capitalize each word */
6649 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6650 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 if (item == NULL)
6653 goto onError;
6654 Py_DECREF(PyList_GET_ITEM(list, i));
6655 PyList_SET_ITEM(list, i, item);
6656 }
6657
6658 /* Join the words to form a new string */
6659 item = PyUnicode_Join(NULL, list);
6660
Benjamin Peterson29060642009-01-31 22:14:21 +00006661 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 Py_DECREF(list);
6663 return (PyObject *)item;
6664}
6665#endif
6666
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006667/* Argument converter. Coerces to a single unicode character */
6668
6669static int
6670convert_uc(PyObject *obj, void *addr)
6671{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006672 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6673 PyObject *uniobj;
6674 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006675
Benjamin Peterson14339b62009-01-31 16:36:08 +00006676 uniobj = PyUnicode_FromObject(obj);
6677 if (uniobj == NULL) {
6678 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006680 return 0;
6681 }
6682 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6683 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006685 Py_DECREF(uniobj);
6686 return 0;
6687 }
6688 unistr = PyUnicode_AS_UNICODE(uniobj);
6689 *fillcharloc = unistr[0];
6690 Py_DECREF(uniobj);
6691 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006692}
6693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006694PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006697Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006698done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699
6700static PyObject *
6701unicode_center(PyUnicodeObject *self, PyObject *args)
6702{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006703 Py_ssize_t marg, left;
6704 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006705 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706
Thomas Woutersde017742006-02-16 19:34:37 +00006707 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 return NULL;
6709
Tim Peters7a29bd52001-09-12 03:03:31 +00006710 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 Py_INCREF(self);
6712 return (PyObject*) self;
6713 }
6714
6715 marg = width - self->length;
6716 left = marg / 2 + (marg & width & 1);
6717
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006718 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719}
6720
Marc-André Lemburge5034372000-08-08 08:04:29 +00006721#if 0
6722
6723/* This code should go into some future Unicode collation support
6724 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006725 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006726
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006727/* speedy UTF-16 code point order comparison */
6728/* gleaned from: */
6729/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6730
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006731static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006732{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006733 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006734 0, 0, 0, 0, 0, 0, 0, 0,
6735 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006736 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006737};
6738
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739static int
6740unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6741{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006742 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006743
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 Py_UNICODE *s1 = str1->str;
6745 Py_UNICODE *s2 = str2->str;
6746
6747 len1 = str1->length;
6748 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006749
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006751 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006752
6753 c1 = *s1++;
6754 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006755
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 if (c1 > (1<<11) * 26)
6757 c1 += utf16Fixup[c1>>11];
6758 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006759 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006760 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006761
6762 if (c1 != c2)
6763 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006764
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006765 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766 }
6767
6768 return (len1 < len2) ? -1 : (len1 != len2);
6769}
6770
Marc-André Lemburge5034372000-08-08 08:04:29 +00006771#else
6772
6773static int
6774unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6775{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006776 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006777
6778 Py_UNICODE *s1 = str1->str;
6779 Py_UNICODE *s2 = str2->str;
6780
6781 len1 = str1->length;
6782 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006783
Marc-André Lemburge5034372000-08-08 08:04:29 +00006784 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006785 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006786
Fredrik Lundh45714e92001-06-26 16:39:36 +00006787 c1 = *s1++;
6788 c2 = *s2++;
6789
6790 if (c1 != c2)
6791 return (c1 < c2) ? -1 : 1;
6792
Marc-André Lemburge5034372000-08-08 08:04:29 +00006793 len1--; len2--;
6794 }
6795
6796 return (len1 < len2) ? -1 : (len1 != len2);
6797}
6798
6799#endif
6800
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006804 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6805 return unicode_compare((PyUnicodeObject *)left,
6806 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006807 PyErr_Format(PyExc_TypeError,
6808 "Can't compare %.100s and %.100s",
6809 left->ob_type->tp_name,
6810 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 return -1;
6812}
6813
Martin v. Löwis5b222132007-06-10 09:51:05 +00006814int
6815PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6816{
6817 int i;
6818 Py_UNICODE *id;
6819 assert(PyUnicode_Check(uni));
6820 id = PyUnicode_AS_UNICODE(uni);
6821 /* Compare Unicode string and source character set string */
6822 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 if (id[i] != str[i])
6824 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006825 /* This check keeps Python strings that end in '\0' from comparing equal
6826 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006827 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006829 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006831 return 0;
6832}
6833
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006834
Benjamin Peterson29060642009-01-31 22:14:21 +00006835#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006836 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006837
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006838PyObject *PyUnicode_RichCompare(PyObject *left,
6839 PyObject *right,
6840 int op)
6841{
6842 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006843
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006844 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6845 PyObject *v;
6846 if (((PyUnicodeObject *) left)->length !=
6847 ((PyUnicodeObject *) right)->length) {
6848 if (op == Py_EQ) {
6849 Py_INCREF(Py_False);
6850 return Py_False;
6851 }
6852 if (op == Py_NE) {
6853 Py_INCREF(Py_True);
6854 return Py_True;
6855 }
6856 }
6857 if (left == right)
6858 result = 0;
6859 else
6860 result = unicode_compare((PyUnicodeObject *)left,
6861 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006862
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006863 /* Convert the return value to a Boolean */
6864 switch (op) {
6865 case Py_EQ:
6866 v = TEST_COND(result == 0);
6867 break;
6868 case Py_NE:
6869 v = TEST_COND(result != 0);
6870 break;
6871 case Py_LE:
6872 v = TEST_COND(result <= 0);
6873 break;
6874 case Py_GE:
6875 v = TEST_COND(result >= 0);
6876 break;
6877 case Py_LT:
6878 v = TEST_COND(result == -1);
6879 break;
6880 case Py_GT:
6881 v = TEST_COND(result == 1);
6882 break;
6883 default:
6884 PyErr_BadArgument();
6885 return NULL;
6886 }
6887 Py_INCREF(v);
6888 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006889 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006890
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006891 Py_INCREF(Py_NotImplemented);
6892 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006893}
6894
Guido van Rossum403d68b2000-03-13 15:55:09 +00006895int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006897{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006898 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006899 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006900
6901 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006902 sub = PyUnicode_FromObject(element);
6903 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 PyErr_Format(PyExc_TypeError,
6905 "'in <string>' requires string as left operand, not %s",
6906 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006907 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006908 }
6909
Thomas Wouters477c8d52006-05-27 19:21:47 +00006910 str = PyUnicode_FromObject(container);
6911 if (!str) {
6912 Py_DECREF(sub);
6913 return -1;
6914 }
6915
6916 result = stringlib_contains_obj(str, sub);
6917
6918 Py_DECREF(str);
6919 Py_DECREF(sub);
6920
Guido van Rossum403d68b2000-03-13 15:55:09 +00006921 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006922}
6923
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924/* Concat to string or Unicode object giving a new Unicode object. */
6925
6926PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928{
6929 PyUnicodeObject *u = NULL, *v = NULL, *w;
6930
6931 /* Coerce the two arguments */
6932 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6933 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6936 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938
6939 /* Shortcuts */
6940 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 Py_DECREF(v);
6942 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 }
6944 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 Py_DECREF(u);
6946 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 }
6948
6949 /* Concat the two Unicode strings */
6950 w = _PyUnicode_New(u->length + v->length);
6951 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 Py_UNICODE_COPY(w->str, u->str, u->length);
6954 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6955
6956 Py_DECREF(u);
6957 Py_DECREF(v);
6958 return (PyObject *)w;
6959
Benjamin Peterson29060642009-01-31 22:14:21 +00006960 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 Py_XDECREF(u);
6962 Py_XDECREF(v);
6963 return NULL;
6964}
6965
Walter Dörwald1ab83302007-05-18 17:15:44 +00006966void
6967PyUnicode_Append(PyObject **pleft, PyObject *right)
6968{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006969 PyObject *new;
6970 if (*pleft == NULL)
6971 return;
6972 if (right == NULL || !PyUnicode_Check(*pleft)) {
6973 Py_DECREF(*pleft);
6974 *pleft = NULL;
6975 return;
6976 }
6977 new = PyUnicode_Concat(*pleft, right);
6978 Py_DECREF(*pleft);
6979 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006980}
6981
6982void
6983PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6984{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006985 PyUnicode_Append(pleft, right);
6986 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006987}
6988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006989PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006992Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006993string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006994interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995
6996static PyObject *
6997unicode_count(PyUnicodeObject *self, PyObject *args)
6998{
6999 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007000 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007001 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 PyObject *result;
7003
Guido van Rossumb8872e62000-05-09 14:14:27 +00007004 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006 return NULL;
7007
7008 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007012
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007013 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007014 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007015 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007016 substring->str, substring->length,
7017 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007018 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019
7020 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007021
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 return result;
7023}
7024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007025PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007028Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007029to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007030handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007031a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7032'xmlcharrefreplace' as well as any other name registered with\n\
7033codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034
7035static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007036unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007038 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 char *encoding = NULL;
7040 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007041 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007042
Benjamin Peterson308d6372009-09-18 21:42:35 +00007043 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7044 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007046 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007047 if (v == NULL)
7048 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007049 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007050 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007051 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007052 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007053 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007054 Py_DECREF(v);
7055 return NULL;
7056 }
7057 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007058
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007060 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007061}
7062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007063PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065\n\
7066Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007067If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068
7069static PyObject*
7070unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7071{
7072 Py_UNICODE *e;
7073 Py_UNICODE *p;
7074 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007075 Py_UNICODE *qe;
7076 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 PyUnicodeObject *u;
7078 int tabsize = 8;
7079
7080 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082
Thomas Wouters7e474022000-07-16 12:04:32 +00007083 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007084 i = 0; /* chars up to and including most recent \n or \r */
7085 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7086 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 for (p = self->str; p < e; p++)
7088 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 if (tabsize > 0) {
7090 incr = tabsize - (j % tabsize); /* cannot overflow */
7091 if (j > PY_SSIZE_T_MAX - incr)
7092 goto overflow1;
7093 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007094 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 if (j > PY_SSIZE_T_MAX - 1)
7098 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 j++;
7100 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 if (i > PY_SSIZE_T_MAX - j)
7102 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007104 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105 }
7106 }
7107
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007108 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007110
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111 /* Second pass: create output string and fill it */
7112 u = _PyUnicode_New(i + j);
7113 if (!u)
7114 return NULL;
7115
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007116 j = 0; /* same as in first pass */
7117 q = u->str; /* next output char */
7118 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119
7120 for (p = self->str; p < e; p++)
7121 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 if (tabsize > 0) {
7123 i = tabsize - (j % tabsize);
7124 j += i;
7125 while (i--) {
7126 if (q >= qe)
7127 goto overflow2;
7128 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007129 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007131 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 else {
7133 if (q >= qe)
7134 goto overflow2;
7135 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007136 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 if (*p == '\n' || *p == '\r')
7138 j = 0;
7139 }
7140
7141 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007142
7143 overflow2:
7144 Py_DECREF(u);
7145 overflow1:
7146 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148}
7149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007150PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152\n\
7153Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007154such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155arguments start and end are interpreted as in slice notation.\n\
7156\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007157Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158
7159static PyObject *
7160unicode_find(PyUnicodeObject *self, PyObject *args)
7161{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007162 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007163 Py_ssize_t start;
7164 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007165 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166
Christian Heimes9cd17752007-11-18 19:35:23 +00007167 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169
Thomas Wouters477c8d52006-05-27 19:21:47 +00007170 result = stringlib_find_slice(
7171 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7172 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7173 start, end
7174 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175
7176 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007177
Christian Heimes217cfd12007-12-02 14:31:20 +00007178 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179}
7180
7181static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007182unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183{
7184 if (index < 0 || index >= self->length) {
7185 PyErr_SetString(PyExc_IndexError, "string index out of range");
7186 return NULL;
7187 }
7188
7189 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7190}
7191
Guido van Rossumc2504932007-09-18 19:42:40 +00007192/* Believe it or not, this produces the same value for ASCII strings
7193 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007195unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196{
Guido van Rossumc2504932007-09-18 19:42:40 +00007197 Py_ssize_t len;
7198 Py_UNICODE *p;
7199 long x;
7200
7201 if (self->hash != -1)
7202 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007203 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007204 p = self->str;
7205 x = *p << 7;
7206 while (--len >= 0)
7207 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007208 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007209 if (x == -1)
7210 x = -2;
7211 self->hash = x;
7212 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213}
7214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007215PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007218Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219
7220static PyObject *
7221unicode_index(PyUnicodeObject *self, PyObject *args)
7222{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007223 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007224 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007225 Py_ssize_t start;
7226 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
Christian Heimes9cd17752007-11-18 19:35:23 +00007228 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230
Thomas Wouters477c8d52006-05-27 19:21:47 +00007231 result = stringlib_find_slice(
7232 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7233 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7234 start, end
7235 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236
7237 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007238
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 if (result < 0) {
7240 PyErr_SetString(PyExc_ValueError, "substring not found");
7241 return NULL;
7242 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007243
Christian Heimes217cfd12007-12-02 14:31:20 +00007244 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245}
7246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007247PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007250Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007251at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252
7253static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007254unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255{
7256 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7257 register const Py_UNICODE *e;
7258 int cased;
7259
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 /* Shortcut for single character strings */
7261 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007264 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007265 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007267
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 e = p + PyUnicode_GET_SIZE(self);
7269 cased = 0;
7270 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007272
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7274 return PyBool_FromLong(0);
7275 else if (!cased && Py_UNICODE_ISLOWER(ch))
7276 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007278 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279}
7280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007281PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007284Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007285at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286
7287static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007288unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289{
7290 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7291 register const Py_UNICODE *e;
7292 int cased;
7293
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 /* Shortcut for single character strings */
7295 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007296 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007298 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007299 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007301
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 e = p + PyUnicode_GET_SIZE(self);
7303 cased = 0;
7304 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007306
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7308 return PyBool_FromLong(0);
7309 else if (!cased && Py_UNICODE_ISUPPER(ch))
7310 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007312 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313}
7314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007315PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007318Return True if S is a titlecased string and there is at least one\n\
7319character in S, i.e. upper- and titlecase characters may only\n\
7320follow uncased characters and lowercase characters only cased ones.\n\
7321Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322
7323static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007324unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325{
7326 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7327 register const Py_UNICODE *e;
7328 int cased, previous_is_cased;
7329
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 /* Shortcut for single character strings */
7331 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7333 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007335 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007336 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007338
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 e = p + PyUnicode_GET_SIZE(self);
7340 cased = 0;
7341 previous_is_cased = 0;
7342 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007344
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7346 if (previous_is_cased)
7347 return PyBool_FromLong(0);
7348 previous_is_cased = 1;
7349 cased = 1;
7350 }
7351 else if (Py_UNICODE_ISLOWER(ch)) {
7352 if (!previous_is_cased)
7353 return PyBool_FromLong(0);
7354 previous_is_cased = 1;
7355 cased = 1;
7356 }
7357 else
7358 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007360 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361}
7362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007363PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007364 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007366Return True if all characters in S are whitespace\n\
7367and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368
7369static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007370unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371{
7372 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7373 register const Py_UNICODE *e;
7374
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 /* Shortcut for single character strings */
7376 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 Py_UNICODE_ISSPACE(*p))
7378 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007380 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007381 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007383
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 e = p + PyUnicode_GET_SIZE(self);
7385 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 if (!Py_UNICODE_ISSPACE(*p))
7387 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007389 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390}
7391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007392PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007394\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007395Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007396and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007397
7398static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007399unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007400{
7401 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7402 register const Py_UNICODE *e;
7403
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007404 /* Shortcut for single character strings */
7405 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 Py_UNICODE_ISALPHA(*p))
7407 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007408
7409 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007410 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007412
7413 e = p + PyUnicode_GET_SIZE(self);
7414 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 if (!Py_UNICODE_ISALPHA(*p))
7416 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007417 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007418 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007419}
7420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007421PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007423\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007424Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007425and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007426
7427static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007428unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007429{
7430 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7431 register const Py_UNICODE *e;
7432
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007433 /* Shortcut for single character strings */
7434 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 Py_UNICODE_ISALNUM(*p))
7436 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007437
7438 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007439 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007441
7442 e = p + PyUnicode_GET_SIZE(self);
7443 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 if (!Py_UNICODE_ISALNUM(*p))
7445 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007446 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007447 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007448}
7449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007450PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007453Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007454False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455
7456static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007457unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458{
7459 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7460 register const Py_UNICODE *e;
7461
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 /* Shortcut for single character strings */
7463 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 Py_UNICODE_ISDECIMAL(*p))
7465 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007467 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007468 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007470
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 e = p + PyUnicode_GET_SIZE(self);
7472 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 if (!Py_UNICODE_ISDECIMAL(*p))
7474 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007476 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477}
7478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007479PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007482Return True if all characters in S are digits\n\
7483and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484
7485static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007486unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487{
7488 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7489 register const Py_UNICODE *e;
7490
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 /* Shortcut for single character strings */
7492 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 Py_UNICODE_ISDIGIT(*p))
7494 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007496 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007497 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007499
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 e = p + PyUnicode_GET_SIZE(self);
7501 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 if (!Py_UNICODE_ISDIGIT(*p))
7503 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007505 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506}
7507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007508PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007511Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007512False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513
7514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007515unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516{
7517 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7518 register const Py_UNICODE *e;
7519
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 /* Shortcut for single character strings */
7521 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 Py_UNICODE_ISNUMERIC(*p))
7523 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007525 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007526 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007528
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 e = p + PyUnicode_GET_SIZE(self);
7530 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 if (!Py_UNICODE_ISNUMERIC(*p))
7532 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007534 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535}
7536
Martin v. Löwis47383402007-08-15 07:32:56 +00007537int
7538PyUnicode_IsIdentifier(PyObject *self)
7539{
7540 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7541 register const Py_UNICODE *e;
7542
7543 /* Special case for empty strings */
7544 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007545 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007546
7547 /* PEP 3131 says that the first character must be in
7548 XID_Start and subsequent characters in XID_Continue,
7549 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007550 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007551 letters, digits, underscore). However, given the current
7552 definition of XID_Start and XID_Continue, it is sufficient
7553 to check just for these, except that _ must be allowed
7554 as starting an identifier. */
7555 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7556 return 0;
7557
7558 e = p + PyUnicode_GET_SIZE(self);
7559 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 if (!_PyUnicode_IsXidContinue(*p))
7561 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007562 }
7563 return 1;
7564}
7565
7566PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007567 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007568\n\
7569Return True if S is a valid identifier according\n\
7570to the language definition.");
7571
7572static PyObject*
7573unicode_isidentifier(PyObject *self)
7574{
7575 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7576}
7577
Georg Brandl559e5d72008-06-11 18:37:52 +00007578PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007580\n\
7581Return True if all characters in S are considered\n\
7582printable in repr() or S is empty, False otherwise.");
7583
7584static PyObject*
7585unicode_isprintable(PyObject *self)
7586{
7587 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7588 register const Py_UNICODE *e;
7589
7590 /* Shortcut for single character strings */
7591 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7592 Py_RETURN_TRUE;
7593 }
7594
7595 e = p + PyUnicode_GET_SIZE(self);
7596 for (; p < e; p++) {
7597 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7598 Py_RETURN_FALSE;
7599 }
7600 }
7601 Py_RETURN_TRUE;
7602}
7603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007604PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007605 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606\n\
7607Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007608iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609
7610static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007611unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007613 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614}
7615
Martin v. Löwis18e16552006-02-15 17:27:45 +00007616static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617unicode_length(PyUnicodeObject *self)
7618{
7619 return self->length;
7620}
7621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007622PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007625Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007626done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
7628static PyObject *
7629unicode_ljust(PyUnicodeObject *self, PyObject *args)
7630{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007631 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007632 Py_UNICODE fillchar = ' ';
7633
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007634 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635 return NULL;
7636
Tim Peters7a29bd52001-09-12 03:03:31 +00007637 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 Py_INCREF(self);
7639 return (PyObject*) self;
7640 }
7641
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007642 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643}
7644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007645PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007648Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649
7650static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007651unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653 return fixup(self, fixlower);
7654}
7655
/* Strip modes; the values double as indices into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7664
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007665/* externally visible for str.strip(unicode) */
7666PyObject *
7667_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7668{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007669 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7670 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7671 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7672 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7673 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007674
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007676
Benjamin Peterson14339b62009-01-31 16:36:08 +00007677 i = 0;
7678 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7680 i++;
7681 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007682 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007683
Benjamin Peterson14339b62009-01-31 16:36:08 +00007684 j = len;
7685 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 do {
7687 j--;
7688 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7689 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007690 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007691
Benjamin Peterson14339b62009-01-31 16:36:08 +00007692 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 Py_INCREF(self);
7694 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007695 }
7696 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007698}
7699
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700
7701static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007702do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007704 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7705 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007706
Benjamin Peterson14339b62009-01-31 16:36:08 +00007707 i = 0;
7708 if (striptype != RIGHTSTRIP) {
7709 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7710 i++;
7711 }
7712 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007713
Benjamin Peterson14339b62009-01-31 16:36:08 +00007714 j = len;
7715 if (striptype != LEFTSTRIP) {
7716 do {
7717 j--;
7718 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7719 j++;
7720 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007721
Benjamin Peterson14339b62009-01-31 16:36:08 +00007722 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7723 Py_INCREF(self);
7724 return (PyObject*)self;
7725 }
7726 else
7727 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728}
7729
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007730
7731static PyObject *
7732do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7733{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007734 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007735
Benjamin Peterson14339b62009-01-31 16:36:08 +00007736 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7737 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007738
Benjamin Peterson14339b62009-01-31 16:36:08 +00007739 if (sep != NULL && sep != Py_None) {
7740 if (PyUnicode_Check(sep))
7741 return _PyUnicode_XStrip(self, striptype, sep);
7742 else {
7743 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 "%s arg must be None or str",
7745 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007746 return NULL;
7747 }
7748 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007749
Benjamin Peterson14339b62009-01-31 16:36:08 +00007750 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007751}
7752
7753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007754PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007756\n\
7757Return a copy of the string S with leading and trailing\n\
7758whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007759If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007760
7761static PyObject *
7762unicode_strip(PyUnicodeObject *self, PyObject *args)
7763{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007764 if (PyTuple_GET_SIZE(args) == 0)
7765 return do_strip(self, BOTHSTRIP); /* Common case */
7766 else
7767 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007768}
7769
7770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007771PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007773\n\
7774Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007775If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007776
7777static PyObject *
7778unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7779{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007780 if (PyTuple_GET_SIZE(args) == 0)
7781 return do_strip(self, LEFTSTRIP); /* Common case */
7782 else
7783 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007784}
7785
7786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007787PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007789\n\
7790Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007791If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007792
7793static PyObject *
7794unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7795{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007796 if (PyTuple_GET_SIZE(args) == 0)
7797 return do_strip(self, RIGHTSTRIP); /* Common case */
7798 else
7799 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007800}
7801
7802
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007804unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805{
7806 PyUnicodeObject *u;
7807 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007808 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007809 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810
Georg Brandl222de0f2009-04-12 12:01:50 +00007811 if (len < 1) {
7812 Py_INCREF(unicode_empty);
7813 return (PyObject *)unicode_empty;
7814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815
Tim Peters7a29bd52001-09-12 03:03:31 +00007816 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817 /* no repeat, return original string */
7818 Py_INCREF(str);
7819 return (PyObject*) str;
7820 }
Tim Peters8f422462000-09-09 06:13:41 +00007821
7822 /* ensure # of chars needed doesn't overflow int and # of bytes
7823 * needed doesn't overflow size_t
7824 */
7825 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007826 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007827 PyErr_SetString(PyExc_OverflowError,
7828 "repeated string is too long");
7829 return NULL;
7830 }
7831 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7832 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7833 PyErr_SetString(PyExc_OverflowError,
7834 "repeated string is too long");
7835 return NULL;
7836 }
7837 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 if (!u)
7839 return NULL;
7840
7841 p = u->str;
7842
Georg Brandl222de0f2009-04-12 12:01:50 +00007843 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007844 Py_UNICODE_FILL(p, str->str[0], len);
7845 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007846 Py_ssize_t done = str->length; /* number of characters copied this far */
7847 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007849 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007850 Py_UNICODE_COPY(p+done, p, n);
7851 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 }
7854
7855 return (PyObject*) u;
7856}
7857
7858PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 PyObject *subobj,
7860 PyObject *replobj,
7861 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862{
7863 PyObject *self;
7864 PyObject *str1;
7865 PyObject *str2;
7866 PyObject *result;
7867
7868 self = PyUnicode_FromObject(obj);
7869 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 str1 = PyUnicode_FromObject(subobj);
7872 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 Py_DECREF(self);
7874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 }
7876 str2 = PyUnicode_FromObject(replobj);
7877 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 Py_DECREF(self);
7879 Py_DECREF(str1);
7880 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881 }
Tim Petersced69f82003-09-16 20:30:58 +00007882 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 (PyUnicodeObject *)str1,
7884 (PyUnicodeObject *)str2,
7885 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 Py_DECREF(self);
7887 Py_DECREF(str1);
7888 Py_DECREF(str2);
7889 return result;
7890}
7891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007892PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894\n\
7895Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007896old replaced by new. If the optional argument count is\n\
7897given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898
7899static PyObject*
7900unicode_replace(PyUnicodeObject *self, PyObject *args)
7901{
7902 PyUnicodeObject *str1;
7903 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007904 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 PyObject *result;
7906
Martin v. Löwis18e16552006-02-15 17:27:45 +00007907 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908 return NULL;
7909 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7910 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007913 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 Py_DECREF(str1);
7915 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917
7918 result = replace(self, str1, str2, maxcount);
7919
7920 Py_DECREF(str1);
7921 Py_DECREF(str2);
7922 return result;
7923}
7924
7925static
7926PyObject *unicode_repr(PyObject *unicode)
7927{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007928 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007929 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007930 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7931 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7932
7933 /* XXX(nnorwitz): rather than over-allocating, it would be
7934 better to choose a different scheme. Perhaps scan the
7935 first N-chars of the string and allocate based on that size.
7936 */
7937 /* Initial allocation is based on the longest-possible unichr
7938 escape.
7939
7940 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7941 unichr, so in this case it's the longest unichr escape. In
7942 narrow (UTF-16) builds this is five chars per source unichr
7943 since there are two unichrs in the surrogate pair, so in narrow
7944 (UTF-16) builds it's not the longest unichr escape.
7945
7946 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7947 so in the narrow (UTF-16) build case it's the longest unichr
7948 escape.
7949 */
7950
Walter Dörwald1ab83302007-05-18 17:15:44 +00007951 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007953#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007955#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007957#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007959 if (repr == NULL)
7960 return NULL;
7961
Walter Dörwald1ab83302007-05-18 17:15:44 +00007962 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007963
7964 /* Add quote */
7965 *p++ = (findchar(s, size, '\'') &&
7966 !findchar(s, size, '"')) ? '"' : '\'';
7967 while (size-- > 0) {
7968 Py_UNICODE ch = *s++;
7969
7970 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007971 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007972 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007973 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007974 continue;
7975 }
7976
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007978 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007979 *p++ = '\\';
7980 *p++ = 't';
7981 }
7982 else if (ch == '\n') {
7983 *p++ = '\\';
7984 *p++ = 'n';
7985 }
7986 else if (ch == '\r') {
7987 *p++ = '\\';
7988 *p++ = 'r';
7989 }
7990
7991 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007992 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007993 *p++ = '\\';
7994 *p++ = 'x';
7995 *p++ = hexdigits[(ch >> 4) & 0x000F];
7996 *p++ = hexdigits[ch & 0x000F];
7997 }
7998
Georg Brandl559e5d72008-06-11 18:37:52 +00007999 /* Copy ASCII characters as-is */
8000 else if (ch < 0x7F) {
8001 *p++ = ch;
8002 }
8003
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008005 else {
8006 Py_UCS4 ucs = ch;
8007
8008#ifndef Py_UNICODE_WIDE
8009 Py_UNICODE ch2 = 0;
8010 /* Get code point from surrogate pair */
8011 if (size > 0) {
8012 ch2 = *s;
8013 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008015 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008017 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008018 size--;
8019 }
8020 }
8021#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008022 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008023 (categories Z* and C* except ASCII space)
8024 */
8025 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8026 /* Map 8-bit characters to '\xhh' */
8027 if (ucs <= 0xff) {
8028 *p++ = '\\';
8029 *p++ = 'x';
8030 *p++ = hexdigits[(ch >> 4) & 0x000F];
8031 *p++ = hexdigits[ch & 0x000F];
8032 }
8033 /* Map 21-bit characters to '\U00xxxxxx' */
8034 else if (ucs >= 0x10000) {
8035 *p++ = '\\';
8036 *p++ = 'U';
8037 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8038 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8039 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8040 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8041 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8042 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8043 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8044 *p++ = hexdigits[ucs & 0x0000000F];
8045 }
8046 /* Map 16-bit characters to '\uxxxx' */
8047 else {
8048 *p++ = '\\';
8049 *p++ = 'u';
8050 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8051 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8052 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8053 *p++ = hexdigits[ucs & 0x000F];
8054 }
8055 }
8056 /* Copy characters as-is */
8057 else {
8058 *p++ = ch;
8059#ifndef Py_UNICODE_WIDE
8060 if (ucs >= 0x10000)
8061 *p++ = ch2;
8062#endif
8063 }
8064 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008065 }
8066 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008067 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008068
8069 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008070 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008071 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072}
8073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008074PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076\n\
8077Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008078such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079arguments start and end are interpreted as in slice notation.\n\
8080\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008081Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082
8083static PyObject *
8084unicode_rfind(PyUnicodeObject *self, PyObject *args)
8085{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008086 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008087 Py_ssize_t start;
8088 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008089 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090
Christian Heimes9cd17752007-11-18 19:35:23 +00008091 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093
Thomas Wouters477c8d52006-05-27 19:21:47 +00008094 result = stringlib_rfind_slice(
8095 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8096 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8097 start, end
8098 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099
8100 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008101
Christian Heimes217cfd12007-12-02 14:31:20 +00008102 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103}
8104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008105PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008108Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109
8110static PyObject *
8111unicode_rindex(PyUnicodeObject *self, PyObject *args)
8112{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008113 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008114 Py_ssize_t start;
8115 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008116 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117
Christian Heimes9cd17752007-11-18 19:35:23 +00008118 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
Thomas Wouters477c8d52006-05-27 19:21:47 +00008121 result = stringlib_rfind_slice(
8122 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8123 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8124 start, end
8125 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
8127 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008128
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 if (result < 0) {
8130 PyErr_SetString(PyExc_ValueError, "substring not found");
8131 return NULL;
8132 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008133 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134}
8135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008136PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008139Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008140done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141
8142static PyObject *
8143unicode_rjust(PyUnicodeObject *self, PyObject *args)
8144{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008145 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008146 Py_UNICODE fillchar = ' ';
8147
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008148 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 return NULL;
8150
Tim Peters7a29bd52001-09-12 03:03:31 +00008151 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152 Py_INCREF(self);
8153 return (PyObject*) self;
8154 }
8155
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008156 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157}
8158
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 PyObject *sep,
8161 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162{
8163 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008164
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 s = PyUnicode_FromObject(s);
8166 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008167 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 if (sep != NULL) {
8169 sep = PyUnicode_FromObject(sep);
8170 if (sep == NULL) {
8171 Py_DECREF(s);
8172 return NULL;
8173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 }
8175
8176 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8177
8178 Py_DECREF(s);
8179 Py_XDECREF(sep);
8180 return result;
8181}
8182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008183PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185\n\
8186Return a list of the words in S, using sep as the\n\
8187delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008188splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008189whitespace string is a separator and empty strings are\n\
8190removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191
8192static PyObject*
8193unicode_split(PyUnicodeObject *self, PyObject *args)
8194{
8195 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008196 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197
Martin v. Löwis18e16552006-02-15 17:27:45 +00008198 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199 return NULL;
8200
8201 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207}
8208
Thomas Wouters477c8d52006-05-27 19:21:47 +00008209PyObject *
8210PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8211{
8212 PyObject* str_obj;
8213 PyObject* sep_obj;
8214 PyObject* out;
8215
8216 str_obj = PyUnicode_FromObject(str_in);
8217 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008219 sep_obj = PyUnicode_FromObject(sep_in);
8220 if (!sep_obj) {
8221 Py_DECREF(str_obj);
8222 return NULL;
8223 }
8224
8225 out = stringlib_partition(
8226 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8227 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8228 );
8229
8230 Py_DECREF(sep_obj);
8231 Py_DECREF(str_obj);
8232
8233 return out;
8234}
8235
8236
8237PyObject *
8238PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8239{
8240 PyObject* str_obj;
8241 PyObject* sep_obj;
8242 PyObject* out;
8243
8244 str_obj = PyUnicode_FromObject(str_in);
8245 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008247 sep_obj = PyUnicode_FromObject(sep_in);
8248 if (!sep_obj) {
8249 Py_DECREF(str_obj);
8250 return NULL;
8251 }
8252
8253 out = stringlib_rpartition(
8254 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8255 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8256 );
8257
8258 Py_DECREF(sep_obj);
8259 Py_DECREF(str_obj);
8260
8261 return out;
8262}
8263
8264PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008266\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008267Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008268the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008269found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008270
8271static PyObject*
8272unicode_partition(PyUnicodeObject *self, PyObject *separator)
8273{
8274 return PyUnicode_Partition((PyObject *)self, separator);
8275}
8276
8277PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008278 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008279\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008280Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008281the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008282separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008283
8284static PyObject*
8285unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8286{
8287 return PyUnicode_RPartition((PyObject *)self, separator);
8288}
8289
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008290PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 PyObject *sep,
8292 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008293{
8294 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008295
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008296 s = PyUnicode_FromObject(s);
8297 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008298 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 if (sep != NULL) {
8300 sep = PyUnicode_FromObject(sep);
8301 if (sep == NULL) {
8302 Py_DECREF(s);
8303 return NULL;
8304 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008305 }
8306
8307 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8308
8309 Py_DECREF(s);
8310 Py_XDECREF(sep);
8311 return result;
8312}
8313
8314PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008316\n\
8317Return a list of the words in S, using sep as the\n\
8318delimiter string, starting at the end of the string and\n\
8319working to the front. If maxsplit is given, at most maxsplit\n\
8320splits are done. If sep is not specified, any whitespace string\n\
8321is a separator.");
8322
8323static PyObject*
8324unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8325{
8326 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008327 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008328
Martin v. Löwis18e16552006-02-15 17:27:45 +00008329 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008330 return NULL;
8331
8332 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008334 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008336 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008338}
8339
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008340PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342\n\
8343Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008344Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008345is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346
8347static PyObject*
8348unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8349{
Guido van Rossum86662912000-04-11 15:38:46 +00008350 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351
Guido van Rossum86662912000-04-11 15:38:46 +00008352 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 return NULL;
8354
Guido van Rossum86662912000-04-11 15:38:46 +00008355 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356}
8357
8358static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008359PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360{
Walter Dörwald346737f2007-05-31 10:44:43 +00008361 if (PyUnicode_CheckExact(self)) {
8362 Py_INCREF(self);
8363 return self;
8364 } else
8365 /* Subtype -- return genuine unicode string with the same value. */
8366 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8367 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368}
8369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008370PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372\n\
8373Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008374and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375
8376static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008377unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 return fixup(self, fixswapcase);
8380}
8381
Georg Brandlceee0772007-11-27 23:48:05 +00008382PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008384\n\
8385Return a translation table usable for str.translate().\n\
8386If there is only one argument, it must be a dictionary mapping Unicode\n\
8387ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008388Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008389If there are two arguments, they must be strings of equal length, and\n\
8390in the resulting dictionary, each character in x will be mapped to the\n\
8391character at the same position in y. If there is a third argument, it\n\
8392must be a string, whose characters will be mapped to None in the result.");
8393
8394static PyObject*
8395unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8396{
8397 PyObject *x, *y = NULL, *z = NULL;
8398 PyObject *new = NULL, *key, *value;
8399 Py_ssize_t i = 0;
8400 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008401
Georg Brandlceee0772007-11-27 23:48:05 +00008402 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8403 return NULL;
8404 new = PyDict_New();
8405 if (!new)
8406 return NULL;
8407 if (y != NULL) {
8408 /* x must be a string too, of equal length */
8409 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8410 if (!PyUnicode_Check(x)) {
8411 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8412 "be a string if there is a second argument");
8413 goto err;
8414 }
8415 if (PyUnicode_GET_SIZE(x) != ylen) {
8416 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8417 "arguments must have equal length");
8418 goto err;
8419 }
8420 /* create entries for translating chars in x to those in y */
8421 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008422 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8423 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008424 if (!key || !value)
8425 goto err;
8426 res = PyDict_SetItem(new, key, value);
8427 Py_DECREF(key);
8428 Py_DECREF(value);
8429 if (res < 0)
8430 goto err;
8431 }
8432 /* create entries for deleting chars in z */
8433 if (z != NULL) {
8434 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008435 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008436 if (!key)
8437 goto err;
8438 res = PyDict_SetItem(new, key, Py_None);
8439 Py_DECREF(key);
8440 if (res < 0)
8441 goto err;
8442 }
8443 }
8444 } else {
8445 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008446 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008447 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8448 "to maketrans it must be a dict");
8449 goto err;
8450 }
8451 /* copy entries into the new dict, converting string keys to int keys */
8452 while (PyDict_Next(x, &i, &key, &value)) {
8453 if (PyUnicode_Check(key)) {
8454 /* convert string keys to integer keys */
8455 PyObject *newkey;
8456 if (PyUnicode_GET_SIZE(key) != 1) {
8457 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8458 "table must be of length 1");
8459 goto err;
8460 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008461 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008462 if (!newkey)
8463 goto err;
8464 res = PyDict_SetItem(new, newkey, value);
8465 Py_DECREF(newkey);
8466 if (res < 0)
8467 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008468 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008469 /* just keep integer keys */
8470 if (PyDict_SetItem(new, key, value) < 0)
8471 goto err;
8472 } else {
8473 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8474 "be strings or integers");
8475 goto err;
8476 }
8477 }
8478 }
8479 return new;
8480 err:
8481 Py_DECREF(new);
8482 return NULL;
8483}
8484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008485PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487\n\
8488Return a copy of the string S, where all characters have been mapped\n\
8489through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008490Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008491Unmapped characters are left untouched. Characters mapped to None\n\
8492are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493
8494static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008495unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496{
Georg Brandlceee0772007-11-27 23:48:05 +00008497 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498}
8499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008500PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008503Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504
8505static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008506unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 return fixup(self, fixupper);
8509}
8510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008511PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008514Pad a numeric string S with zeros on the left, to fill a field\n\
8515of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516
8517static PyObject *
8518unicode_zfill(PyUnicodeObject *self, PyObject *args)
8519{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008520 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521 PyUnicodeObject *u;
8522
Martin v. Löwis18e16552006-02-15 17:27:45 +00008523 Py_ssize_t width;
8524 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525 return NULL;
8526
8527 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008528 if (PyUnicode_CheckExact(self)) {
8529 Py_INCREF(self);
8530 return (PyObject*) self;
8531 }
8532 else
8533 return PyUnicode_FromUnicode(
8534 PyUnicode_AS_UNICODE(self),
8535 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 }
8538
8539 fill = width - self->length;
8540
8541 u = pad(self, fill, 0, '0');
8542
Walter Dörwald068325e2002-04-15 13:36:47 +00008543 if (u == NULL)
8544 return NULL;
8545
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546 if (u->str[fill] == '+' || u->str[fill] == '-') {
8547 /* move sign to beginning of string */
8548 u->str[0] = u->str[fill];
8549 u->str[fill] = '0';
8550 }
8551
8552 return (PyObject*) u;
8553}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
#if 0
/* Compiled out: returns the current value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008563PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008566Return True if S starts with the specified prefix, False otherwise.\n\
8567With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008568With optional end, stop comparing S at that position.\n\
8569prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570
8571static PyObject *
8572unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008575 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008577 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008578 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008579 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008581 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8583 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008584 if (PyTuple_Check(subobj)) {
8585 Py_ssize_t i;
8586 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8587 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008589 if (substring == NULL)
8590 return NULL;
8591 result = tailmatch(self, substring, start, end, -1);
8592 Py_DECREF(substring);
8593 if (result) {
8594 Py_RETURN_TRUE;
8595 }
8596 }
8597 /* nothing matched */
8598 Py_RETURN_FALSE;
8599 }
8600 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008603 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008605 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606}
8607
8608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008609PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008612Return True if S ends with the specified suffix, False otherwise.\n\
8613With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008614With optional end, stop comparing S at that position.\n\
8615suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616
8617static PyObject *
8618unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008621 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008623 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008624 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008625 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008627 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8629 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008630 if (PyTuple_Check(subobj)) {
8631 Py_ssize_t i;
8632 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8633 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008635 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008637 result = tailmatch(self, substring, start, end, +1);
8638 Py_DECREF(substring);
8639 if (result) {
8640 Py_RETURN_TRUE;
8641 }
8642 }
8643 Py_RETURN_FALSE;
8644 }
8645 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008649 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008651 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652}
8653
Eric Smith8c663262007-08-25 02:26:07 +00008654#include "stringlib/string_format.h"
8655
8656PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008658\n\
8659");
8660
Eric Smith4a7d76d2008-05-30 18:10:19 +00008661static PyObject *
8662unicode__format__(PyObject* self, PyObject* args)
8663{
8664 PyObject *format_spec;
8665
8666 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8667 return NULL;
8668
8669 return _PyUnicode_FormatAdvanced(self,
8670 PyUnicode_AS_UNICODE(format_spec),
8671 PyUnicode_GET_SIZE(format_spec));
8672}
8673
Eric Smith8c663262007-08-25 02:26:07 +00008674PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008676\n\
8677");
8678
8679static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008680unicode__sizeof__(PyUnicodeObject *v)
8681{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008682 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8683 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008684}
8685
8686PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008688
8689static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008690unicode_getnewargs(PyUnicodeObject *v)
8691{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008692 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008693}
8694
8695
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696static PyMethodDef unicode_methods[] = {
8697
8698 /* Order is according to common usage: often used methods should
8699 appear first, since lookup is done sequentially. */
8700
Benjamin Peterson308d6372009-09-18 21:42:35 +00008701 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008702 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8703 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008704 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008705 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8706 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8707 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8708 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8709 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8710 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8711 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008712 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008713 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8714 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8715 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008716 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008717 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8718 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8719 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008720 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008721 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008722 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008723 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008724 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8725 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8726 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8727 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8728 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8729 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8730 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8731 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8732 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8733 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8734 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8735 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8736 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8737 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008738 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008739 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008740 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008741 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008742 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008743 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8744 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008745 {"maketrans", (PyCFunction) unicode_maketrans,
8746 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008747 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008748#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008749 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750#endif
8751
8752#if 0
8753 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008754 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755#endif
8756
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758 {NULL, NULL}
8759};
8760
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008761static PyObject *
8762unicode_mod(PyObject *v, PyObject *w)
8763{
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 if (!PyUnicode_Check(v)) {
8765 Py_INCREF(Py_NotImplemented);
8766 return Py_NotImplemented;
8767 }
8768 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008769}
8770
8771static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008772 0, /*nb_add*/
8773 0, /*nb_subtract*/
8774 0, /*nb_multiply*/
8775 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008776};
8777
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008779 (lenfunc) unicode_length, /* sq_length */
8780 PyUnicode_Concat, /* sq_concat */
8781 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8782 (ssizeargfunc) unicode_getitem, /* sq_item */
8783 0, /* sq_slice */
8784 0, /* sq_ass_item */
8785 0, /* sq_ass_slice */
8786 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787};
8788
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008789static PyObject*
8790unicode_subscript(PyUnicodeObject* self, PyObject* item)
8791{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008792 if (PyIndex_Check(item)) {
8793 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008794 if (i == -1 && PyErr_Occurred())
8795 return NULL;
8796 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008797 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008798 return unicode_getitem(self, i);
8799 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008800 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008801 Py_UNICODE* source_buf;
8802 Py_UNICODE* result_buf;
8803 PyObject* result;
8804
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008805 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008806 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008807 return NULL;
8808 }
8809
8810 if (slicelength <= 0) {
8811 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008812 } else if (start == 0 && step == 1 && slicelength == self->length &&
8813 PyUnicode_CheckExact(self)) {
8814 Py_INCREF(self);
8815 return (PyObject *)self;
8816 } else if (step == 1) {
8817 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008818 } else {
8819 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008820 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8821 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008822
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 if (result_buf == NULL)
8824 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008825
8826 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8827 result_buf[i] = source_buf[cur];
8828 }
Tim Petersced69f82003-09-16 20:30:58 +00008829
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008830 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008831 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008832 return result;
8833 }
8834 } else {
8835 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8836 return NULL;
8837 }
8838}
8839
8840static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008841 (lenfunc)unicode_length, /* mp_length */
8842 (binaryfunc)unicode_subscript, /* mp_subscript */
8843 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008844};
8845
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847/* Helpers for PyUnicode_Format() */
8848
8849static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008850getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008852 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 (*p_argidx)++;
8855 if (arglen < 0)
8856 return args;
8857 else
8858 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859 }
8860 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 return NULL;
8863}
8864
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008865/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008867static PyObject *
8868formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008870 char *p;
8871 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008873
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 x = PyFloat_AsDouble(v);
8875 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008876 return NULL;
8877
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008880
Eric Smith0923d1d2009-04-16 20:16:10 +00008881 p = PyOS_double_to_string(x, type, prec,
8882 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008883 if (p == NULL)
8884 return NULL;
8885 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008886 PyMem_Free(p);
8887 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888}
8889
Tim Peters38fd5b62000-09-21 05:43:11 +00008890static PyObject*
8891formatlong(PyObject *val, int flags, int prec, int type)
8892{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008893 char *buf;
8894 int len;
8895 PyObject *str; /* temporary string object. */
8896 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008897
Benjamin Peterson14339b62009-01-31 16:36:08 +00008898 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8899 if (!str)
8900 return NULL;
8901 result = PyUnicode_FromStringAndSize(buf, len);
8902 Py_DECREF(str);
8903 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008904}
8905
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906static int
8907formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008908 size_t buflen,
8909 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008911 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008912 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008913 if (PyUnicode_GET_SIZE(v) == 1) {
8914 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8915 buf[1] = '\0';
8916 return 1;
8917 }
8918#ifndef Py_UNICODE_WIDE
8919 if (PyUnicode_GET_SIZE(v) == 2) {
8920 /* Decode a valid surrogate pair */
8921 int c0 = PyUnicode_AS_UNICODE(v)[0];
8922 int c1 = PyUnicode_AS_UNICODE(v)[1];
8923 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8924 0xDC00 <= c1 && c1 <= 0xDFFF) {
8925 buf[0] = c0;
8926 buf[1] = c1;
8927 buf[2] = '\0';
8928 return 2;
8929 }
8930 }
8931#endif
8932 goto onError;
8933 }
8934 else {
8935 /* Integer input truncated to a character */
8936 long x;
8937 x = PyLong_AsLong(v);
8938 if (x == -1 && PyErr_Occurred())
8939 goto onError;
8940
8941 if (x < 0 || x > 0x10ffff) {
8942 PyErr_SetString(PyExc_OverflowError,
8943 "%c arg not in range(0x110000)");
8944 return -1;
8945 }
8946
8947#ifndef Py_UNICODE_WIDE
8948 if (x > 0xffff) {
8949 x -= 0x10000;
8950 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8951 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8952 return 2;
8953 }
8954#endif
8955 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008956 buf[1] = '\0';
8957 return 1;
8958 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008959
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008961 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008963 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964}
8965
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that it remains a single
   operand wherever the macro is used (for example directly after sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008970
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973{
8974 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008975 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 int args_owned = 0;
8977 PyUnicodeObject *result = NULL;
8978 PyObject *dict = NULL;
8979 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008980
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 PyErr_BadInternalCall();
8983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 }
8985 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008986 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988 fmt = PyUnicode_AS_UNICODE(uformat);
8989 fmtcnt = PyUnicode_GET_SIZE(uformat);
8990
8991 reslen = rescnt = fmtcnt + 100;
8992 result = _PyUnicode_New(reslen);
8993 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 res = PyUnicode_AS_UNICODE(result);
8996
8997 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 arglen = PyTuple_Size(args);
8999 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000 }
9001 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 arglen = -1;
9003 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009005 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009006 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008
9009 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 if (*fmt != '%') {
9011 if (--rescnt < 0) {
9012 rescnt = fmtcnt + 100;
9013 reslen += rescnt;
9014 if (_PyUnicode_Resize(&result, reslen) < 0)
9015 goto onError;
9016 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9017 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009018 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009019 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009020 }
9021 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 /* Got a format specifier */
9023 int flags = 0;
9024 Py_ssize_t width = -1;
9025 int prec = -1;
9026 Py_UNICODE c = '\0';
9027 Py_UNICODE fill;
9028 int isnumok;
9029 PyObject *v = NULL;
9030 PyObject *temp = NULL;
9031 Py_UNICODE *pbuf;
9032 Py_UNICODE sign;
9033 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009034 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 fmt++;
9037 if (*fmt == '(') {
9038 Py_UNICODE *keystart;
9039 Py_ssize_t keylen;
9040 PyObject *key;
9041 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009042
Benjamin Peterson29060642009-01-31 22:14:21 +00009043 if (dict == NULL) {
9044 PyErr_SetString(PyExc_TypeError,
9045 "format requires a mapping");
9046 goto onError;
9047 }
9048 ++fmt;
9049 --fmtcnt;
9050 keystart = fmt;
9051 /* Skip over balanced parentheses */
9052 while (pcount > 0 && --fmtcnt >= 0) {
9053 if (*fmt == ')')
9054 --pcount;
9055 else if (*fmt == '(')
9056 ++pcount;
9057 fmt++;
9058 }
9059 keylen = fmt - keystart - 1;
9060 if (fmtcnt < 0 || pcount > 0) {
9061 PyErr_SetString(PyExc_ValueError,
9062 "incomplete format key");
9063 goto onError;
9064 }
9065#if 0
9066 /* keys are converted to strings using UTF-8 and
9067 then looked up since Python uses strings to hold
9068 variables names etc. in its namespaces and we
9069 wouldn't want to break common idioms. */
9070 key = PyUnicode_EncodeUTF8(keystart,
9071 keylen,
9072 NULL);
9073#else
9074 key = PyUnicode_FromUnicode(keystart, keylen);
9075#endif
9076 if (key == NULL)
9077 goto onError;
9078 if (args_owned) {
9079 Py_DECREF(args);
9080 args_owned = 0;
9081 }
9082 args = PyObject_GetItem(dict, key);
9083 Py_DECREF(key);
9084 if (args == NULL) {
9085 goto onError;
9086 }
9087 args_owned = 1;
9088 arglen = -1;
9089 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009090 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009091 while (--fmtcnt >= 0) {
9092 switch (c = *fmt++) {
9093 case '-': flags |= F_LJUST; continue;
9094 case '+': flags |= F_SIGN; continue;
9095 case ' ': flags |= F_BLANK; continue;
9096 case '#': flags |= F_ALT; continue;
9097 case '0': flags |= F_ZERO; continue;
9098 }
9099 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009100 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009101 if (c == '*') {
9102 v = getnextarg(args, arglen, &argidx);
9103 if (v == NULL)
9104 goto onError;
9105 if (!PyLong_Check(v)) {
9106 PyErr_SetString(PyExc_TypeError,
9107 "* wants int");
9108 goto onError;
9109 }
9110 width = PyLong_AsLong(v);
9111 if (width == -1 && PyErr_Occurred())
9112 goto onError;
9113 if (width < 0) {
9114 flags |= F_LJUST;
9115 width = -width;
9116 }
9117 if (--fmtcnt >= 0)
9118 c = *fmt++;
9119 }
9120 else if (c >= '0' && c <= '9') {
9121 width = c - '0';
9122 while (--fmtcnt >= 0) {
9123 c = *fmt++;
9124 if (c < '0' || c > '9')
9125 break;
9126 if ((width*10) / 10 != width) {
9127 PyErr_SetString(PyExc_ValueError,
9128 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009129 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 }
9131 width = width*10 + (c - '0');
9132 }
9133 }
9134 if (c == '.') {
9135 prec = 0;
9136 if (--fmtcnt >= 0)
9137 c = *fmt++;
9138 if (c == '*') {
9139 v = getnextarg(args, arglen, &argidx);
9140 if (v == NULL)
9141 goto onError;
9142 if (!PyLong_Check(v)) {
9143 PyErr_SetString(PyExc_TypeError,
9144 "* wants int");
9145 goto onError;
9146 }
9147 prec = PyLong_AsLong(v);
9148 if (prec == -1 && PyErr_Occurred())
9149 goto onError;
9150 if (prec < 0)
9151 prec = 0;
9152 if (--fmtcnt >= 0)
9153 c = *fmt++;
9154 }
9155 else if (c >= '0' && c <= '9') {
9156 prec = c - '0';
9157 while (--fmtcnt >= 0) {
9158 c = Py_CHARMASK(*fmt++);
9159 if (c < '0' || c > '9')
9160 break;
9161 if ((prec*10) / 10 != prec) {
9162 PyErr_SetString(PyExc_ValueError,
9163 "prec too big");
9164 goto onError;
9165 }
9166 prec = prec*10 + (c - '0');
9167 }
9168 }
9169 } /* prec */
9170 if (fmtcnt >= 0) {
9171 if (c == 'h' || c == 'l' || c == 'L') {
9172 if (--fmtcnt >= 0)
9173 c = *fmt++;
9174 }
9175 }
9176 if (fmtcnt < 0) {
9177 PyErr_SetString(PyExc_ValueError,
9178 "incomplete format");
9179 goto onError;
9180 }
9181 if (c != '%') {
9182 v = getnextarg(args, arglen, &argidx);
9183 if (v == NULL)
9184 goto onError;
9185 }
9186 sign = 0;
9187 fill = ' ';
9188 switch (c) {
9189
9190 case '%':
9191 pbuf = formatbuf;
9192 /* presume that buffer length is at least 1 */
9193 pbuf[0] = '%';
9194 len = 1;
9195 break;
9196
9197 case 's':
9198 case 'r':
9199 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009200 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 temp = v;
9202 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009203 }
9204 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 if (c == 's')
9206 temp = PyObject_Str(v);
9207 else if (c == 'r')
9208 temp = PyObject_Repr(v);
9209 else
9210 temp = PyObject_ASCII(v);
9211 if (temp == NULL)
9212 goto onError;
9213 if (PyUnicode_Check(temp))
9214 /* nothing to do */;
9215 else {
9216 Py_DECREF(temp);
9217 PyErr_SetString(PyExc_TypeError,
9218 "%s argument has non-string str()");
9219 goto onError;
9220 }
9221 }
9222 pbuf = PyUnicode_AS_UNICODE(temp);
9223 len = PyUnicode_GET_SIZE(temp);
9224 if (prec >= 0 && len > prec)
9225 len = prec;
9226 break;
9227
9228 case 'i':
9229 case 'd':
9230 case 'u':
9231 case 'o':
9232 case 'x':
9233 case 'X':
9234 if (c == 'i')
9235 c = 'd';
9236 isnumok = 0;
9237 if (PyNumber_Check(v)) {
9238 PyObject *iobj=NULL;
9239
9240 if (PyLong_Check(v)) {
9241 iobj = v;
9242 Py_INCREF(iobj);
9243 }
9244 else {
9245 iobj = PyNumber_Long(v);
9246 }
9247 if (iobj!=NULL) {
9248 if (PyLong_Check(iobj)) {
9249 isnumok = 1;
9250 temp = formatlong(iobj, flags, prec, c);
9251 Py_DECREF(iobj);
9252 if (!temp)
9253 goto onError;
9254 pbuf = PyUnicode_AS_UNICODE(temp);
9255 len = PyUnicode_GET_SIZE(temp);
9256 sign = 1;
9257 }
9258 else {
9259 Py_DECREF(iobj);
9260 }
9261 }
9262 }
9263 if (!isnumok) {
9264 PyErr_Format(PyExc_TypeError,
9265 "%%%c format: a number is required, "
9266 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9267 goto onError;
9268 }
9269 if (flags & F_ZERO)
9270 fill = '0';
9271 break;
9272
9273 case 'e':
9274 case 'E':
9275 case 'f':
9276 case 'F':
9277 case 'g':
9278 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009279 temp = formatfloat(v, flags, prec, c);
9280 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009281 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009282 pbuf = PyUnicode_AS_UNICODE(temp);
9283 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009284 sign = 1;
9285 if (flags & F_ZERO)
9286 fill = '0';
9287 break;
9288
9289 case 'c':
9290 pbuf = formatbuf;
9291 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9292 if (len < 0)
9293 goto onError;
9294 break;
9295
9296 default:
9297 PyErr_Format(PyExc_ValueError,
9298 "unsupported format character '%c' (0x%x) "
9299 "at index %zd",
9300 (31<=c && c<=126) ? (char)c : '?',
9301 (int)c,
9302 (Py_ssize_t)(fmt - 1 -
9303 PyUnicode_AS_UNICODE(uformat)));
9304 goto onError;
9305 }
9306 if (sign) {
9307 if (*pbuf == '-' || *pbuf == '+') {
9308 sign = *pbuf++;
9309 len--;
9310 }
9311 else if (flags & F_SIGN)
9312 sign = '+';
9313 else if (flags & F_BLANK)
9314 sign = ' ';
9315 else
9316 sign = 0;
9317 }
9318 if (width < len)
9319 width = len;
9320 if (rescnt - (sign != 0) < width) {
9321 reslen -= rescnt;
9322 rescnt = width + fmtcnt + 100;
9323 reslen += rescnt;
9324 if (reslen < 0) {
9325 Py_XDECREF(temp);
9326 PyErr_NoMemory();
9327 goto onError;
9328 }
9329 if (_PyUnicode_Resize(&result, reslen) < 0) {
9330 Py_XDECREF(temp);
9331 goto onError;
9332 }
9333 res = PyUnicode_AS_UNICODE(result)
9334 + reslen - rescnt;
9335 }
9336 if (sign) {
9337 if (fill != ' ')
9338 *res++ = sign;
9339 rescnt--;
9340 if (width > len)
9341 width--;
9342 }
9343 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9344 assert(pbuf[0] == '0');
9345 assert(pbuf[1] == c);
9346 if (fill != ' ') {
9347 *res++ = *pbuf++;
9348 *res++ = *pbuf++;
9349 }
9350 rescnt -= 2;
9351 width -= 2;
9352 if (width < 0)
9353 width = 0;
9354 len -= 2;
9355 }
9356 if (width > len && !(flags & F_LJUST)) {
9357 do {
9358 --rescnt;
9359 *res++ = fill;
9360 } while (--width > len);
9361 }
9362 if (fill == ' ') {
9363 if (sign)
9364 *res++ = sign;
9365 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9366 assert(pbuf[0] == '0');
9367 assert(pbuf[1] == c);
9368 *res++ = *pbuf++;
9369 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009370 }
9371 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 Py_UNICODE_COPY(res, pbuf, len);
9373 res += len;
9374 rescnt -= len;
9375 while (--width >= len) {
9376 --rescnt;
9377 *res++ = ' ';
9378 }
9379 if (dict && (argidx < arglen) && c != '%') {
9380 PyErr_SetString(PyExc_TypeError,
9381 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009382 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009383 goto onError;
9384 }
9385 Py_XDECREF(temp);
9386 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387 } /* until end */
9388 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 PyErr_SetString(PyExc_TypeError,
9390 "not all arguments converted during string formatting");
9391 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 }
9393
Thomas Woutersa96affe2006-03-12 00:29:36 +00009394 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009395 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398 }
9399 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400 return (PyObject *)result;
9401
Benjamin Peterson29060642009-01-31 22:14:21 +00009402 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403 Py_XDECREF(result);
9404 Py_DECREF(uformat);
9405 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009406 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009407 }
9408 return NULL;
9409}
9410
Jeremy Hylton938ace62002-07-17 16:30:39 +00009411static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009412unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9413
Tim Peters6d6c1a32001-08-02 04:15:00 +00009414static PyObject *
9415unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9416{
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009418 static char *kwlist[] = {"object", "encoding", "errors", 0};
9419 char *encoding = NULL;
9420 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009421
Benjamin Peterson14339b62009-01-31 16:36:08 +00009422 if (type != &PyUnicode_Type)
9423 return unicode_subtype_new(type, args, kwds);
9424 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009425 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009426 return NULL;
9427 if (x == NULL)
9428 return (PyObject *)_PyUnicode_New(0);
9429 if (encoding == NULL && errors == NULL)
9430 return PyObject_Str(x);
9431 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009432 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009433}
9434
Guido van Rossume023fe02001-08-30 03:12:59 +00009435static PyObject *
9436unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9437{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009438 PyUnicodeObject *tmp, *pnew;
9439 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009440
Benjamin Peterson14339b62009-01-31 16:36:08 +00009441 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9442 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9443 if (tmp == NULL)
9444 return NULL;
9445 assert(PyUnicode_Check(tmp));
9446 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9447 if (pnew == NULL) {
9448 Py_DECREF(tmp);
9449 return NULL;
9450 }
9451 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9452 if (pnew->str == NULL) {
9453 _Py_ForgetReference((PyObject *)pnew);
9454 PyObject_Del(pnew);
9455 Py_DECREF(tmp);
9456 return PyErr_NoMemory();
9457 }
9458 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9459 pnew->length = n;
9460 pnew->hash = tmp->hash;
9461 Py_DECREF(tmp);
9462 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009463}
9464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009465PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009466 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009467\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009468Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009469encoding defaults to the current default string encoding.\n\
9470errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009471
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009472static PyObject *unicode_iter(PyObject *seq);
9473
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009475 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009476 "str", /* tp_name */
9477 sizeof(PyUnicodeObject), /* tp_size */
9478 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009480 (destructor)unicode_dealloc, /* tp_dealloc */
9481 0, /* tp_print */
9482 0, /* tp_getattr */
9483 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009484 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009485 unicode_repr, /* tp_repr */
9486 &unicode_as_number, /* tp_as_number */
9487 &unicode_as_sequence, /* tp_as_sequence */
9488 &unicode_as_mapping, /* tp_as_mapping */
9489 (hashfunc) unicode_hash, /* tp_hash*/
9490 0, /* tp_call*/
9491 (reprfunc) unicode_str, /* tp_str */
9492 PyObject_GenericGetAttr, /* tp_getattro */
9493 0, /* tp_setattro */
9494 0, /* tp_as_buffer */
9495 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009497 unicode_doc, /* tp_doc */
9498 0, /* tp_traverse */
9499 0, /* tp_clear */
9500 PyUnicode_RichCompare, /* tp_richcompare */
9501 0, /* tp_weaklistoffset */
9502 unicode_iter, /* tp_iter */
9503 0, /* tp_iternext */
9504 unicode_methods, /* tp_methods */
9505 0, /* tp_members */
9506 0, /* tp_getset */
9507 &PyBaseObject_Type, /* tp_base */
9508 0, /* tp_dict */
9509 0, /* tp_descr_get */
9510 0, /* tp_descr_set */
9511 0, /* tp_dictoffset */
9512 0, /* tp_init */
9513 0, /* tp_alloc */
9514 unicode_new, /* tp_new */
9515 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516};
9517
9518/* Initialize the Unicode implementation */
9519
Thomas Wouters78890102000-07-22 19:25:51 +00009520void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009522 int i;
9523
Thomas Wouters477c8d52006-05-27 19:21:47 +00009524 /* XXX - move this array to unicodectype.c ? */
9525 Py_UNICODE linebreak[] = {
9526 0x000A, /* LINE FEED */
9527 0x000D, /* CARRIAGE RETURN */
9528 0x001C, /* FILE SEPARATOR */
9529 0x001D, /* GROUP SEPARATOR */
9530 0x001E, /* RECORD SEPARATOR */
9531 0x0085, /* NEXT LINE */
9532 0x2028, /* LINE SEPARATOR */
9533 0x2029, /* PARAGRAPH SEPARATOR */
9534 };
9535
Fred Drakee4315f52000-05-09 19:53:39 +00009536 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009537 free_list = NULL;
9538 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009540 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009542
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009543 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009545 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009547
9548 /* initialize the linebreak bloom filter */
9549 bloom_linebreak = make_bloom_mask(
9550 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9551 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009552
9553 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554}
9555
9556/* Finalize the Unicode implementation */
9557
Christian Heimesa156e092008-02-16 07:38:31 +00009558int
9559PyUnicode_ClearFreeList(void)
9560{
9561 int freelist_size = numfree;
9562 PyUnicodeObject *u;
9563
9564 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009565 PyUnicodeObject *v = u;
9566 u = *(PyUnicodeObject **)u;
9567 if (v->str)
9568 PyObject_DEL(v->str);
9569 Py_XDECREF(v->defenc);
9570 PyObject_Del(v);
9571 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009572 }
9573 free_list = NULL;
9574 assert(numfree == 0);
9575 return freelist_size;
9576}
9577
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578void
Thomas Wouters78890102000-07-22 19:25:51 +00009579_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009581 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009583 Py_XDECREF(unicode_empty);
9584 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009585
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009586 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 if (unicode_latin1[i]) {
9588 Py_DECREF(unicode_latin1[i]);
9589 unicode_latin1[i] = NULL;
9590 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009591 }
Christian Heimesa156e092008-02-16 07:38:31 +00009592 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009594
Walter Dörwald16807132007-05-25 13:52:07 +00009595void
9596PyUnicode_InternInPlace(PyObject **p)
9597{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009598 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9599 PyObject *t;
9600 if (s == NULL || !PyUnicode_Check(s))
9601 Py_FatalError(
9602 "PyUnicode_InternInPlace: unicode strings only please!");
9603 /* If it's a subclass, we don't really know what putting
9604 it in the interned dict might do. */
9605 if (!PyUnicode_CheckExact(s))
9606 return;
9607 if (PyUnicode_CHECK_INTERNED(s))
9608 return;
9609 if (interned == NULL) {
9610 interned = PyDict_New();
9611 if (interned == NULL) {
9612 PyErr_Clear(); /* Don't leave an exception */
9613 return;
9614 }
9615 }
9616 /* It might be that the GetItem call fails even
9617 though the key is present in the dictionary,
9618 namely when this happens during a stack overflow. */
9619 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009620 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009621 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009622
Benjamin Peterson29060642009-01-31 22:14:21 +00009623 if (t) {
9624 Py_INCREF(t);
9625 Py_DECREF(*p);
9626 *p = t;
9627 return;
9628 }
Walter Dörwald16807132007-05-25 13:52:07 +00009629
Benjamin Peterson14339b62009-01-31 16:36:08 +00009630 PyThreadState_GET()->recursion_critical = 1;
9631 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9632 PyErr_Clear();
9633 PyThreadState_GET()->recursion_critical = 0;
9634 return;
9635 }
9636 PyThreadState_GET()->recursion_critical = 0;
9637 /* The two references in interned are not counted by refcnt.
9638 The deallocator will take care of this */
9639 Py_REFCNT(s) -= 2;
9640 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009641}
9642
9643void
9644PyUnicode_InternImmortal(PyObject **p)
9645{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009646 PyUnicode_InternInPlace(p);
9647 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9648 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9649 Py_INCREF(*p);
9650 }
Walter Dörwald16807132007-05-25 13:52:07 +00009651}
9652
9653PyObject *
9654PyUnicode_InternFromString(const char *cp)
9655{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009656 PyObject *s = PyUnicode_FromString(cp);
9657 if (s == NULL)
9658 return NULL;
9659 PyUnicode_InternInPlace(&s);
9660 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009661}
9662
9663void _Py_ReleaseInternedUnicodeStrings(void)
9664{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009665 PyObject *keys;
9666 PyUnicodeObject *s;
9667 Py_ssize_t i, n;
9668 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009669
Benjamin Peterson14339b62009-01-31 16:36:08 +00009670 if (interned == NULL || !PyDict_Check(interned))
9671 return;
9672 keys = PyDict_Keys(interned);
9673 if (keys == NULL || !PyList_Check(keys)) {
9674 PyErr_Clear();
9675 return;
9676 }
Walter Dörwald16807132007-05-25 13:52:07 +00009677
Benjamin Peterson14339b62009-01-31 16:36:08 +00009678 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9679 detector, interned unicode strings are not forcibly deallocated;
9680 rather, we give them their stolen references back, and then clear
9681 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009682
Benjamin Peterson14339b62009-01-31 16:36:08 +00009683 n = PyList_GET_SIZE(keys);
9684 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009685 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009686 for (i = 0; i < n; i++) {
9687 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9688 switch (s->state) {
9689 case SSTATE_NOT_INTERNED:
9690 /* XXX Shouldn't happen */
9691 break;
9692 case SSTATE_INTERNED_IMMORTAL:
9693 Py_REFCNT(s) += 1;
9694 immortal_size += s->length;
9695 break;
9696 case SSTATE_INTERNED_MORTAL:
9697 Py_REFCNT(s) += 2;
9698 mortal_size += s->length;
9699 break;
9700 default:
9701 Py_FatalError("Inconsistent interned string state.");
9702 }
9703 s->state = SSTATE_NOT_INTERNED;
9704 }
9705 fprintf(stderr, "total size of all interned strings: "
9706 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9707 "mortal/immortal\n", mortal_size, immortal_size);
9708 Py_DECREF(keys);
9709 PyDict_Clear(interned);
9710 Py_DECREF(interned);
9711 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009712}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009713
9714
9715/********************* Unicode Iterator **************************/
9716
9717typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009718 PyObject_HEAD
9719 Py_ssize_t it_index;
9720 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009721} unicodeiterobject;
9722
9723static void
9724unicodeiter_dealloc(unicodeiterobject *it)
9725{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009726 _PyObject_GC_UNTRACK(it);
9727 Py_XDECREF(it->it_seq);
9728 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009729}
9730
9731static int
9732unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9733{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009734 Py_VISIT(it->it_seq);
9735 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009736}
9737
9738static PyObject *
9739unicodeiter_next(unicodeiterobject *it)
9740{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009741 PyUnicodeObject *seq;
9742 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009743
Benjamin Peterson14339b62009-01-31 16:36:08 +00009744 assert(it != NULL);
9745 seq = it->it_seq;
9746 if (seq == NULL)
9747 return NULL;
9748 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009749
Benjamin Peterson14339b62009-01-31 16:36:08 +00009750 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9751 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009752 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009753 if (item != NULL)
9754 ++it->it_index;
9755 return item;
9756 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009757
Benjamin Peterson14339b62009-01-31 16:36:08 +00009758 Py_DECREF(seq);
9759 it->it_seq = NULL;
9760 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009761}
9762
9763static PyObject *
9764unicodeiter_len(unicodeiterobject *it)
9765{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009766 Py_ssize_t len = 0;
9767 if (it->it_seq)
9768 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9769 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009770}
9771
9772PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9773
9774static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009775 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009776 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009777 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009778};
9779
9780PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009781 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9782 "str_iterator", /* tp_name */
9783 sizeof(unicodeiterobject), /* tp_basicsize */
9784 0, /* tp_itemsize */
9785 /* methods */
9786 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9787 0, /* tp_print */
9788 0, /* tp_getattr */
9789 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009790 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009791 0, /* tp_repr */
9792 0, /* tp_as_number */
9793 0, /* tp_as_sequence */
9794 0, /* tp_as_mapping */
9795 0, /* tp_hash */
9796 0, /* tp_call */
9797 0, /* tp_str */
9798 PyObject_GenericGetAttr, /* tp_getattro */
9799 0, /* tp_setattro */
9800 0, /* tp_as_buffer */
9801 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9802 0, /* tp_doc */
9803 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9804 0, /* tp_clear */
9805 0, /* tp_richcompare */
9806 0, /* tp_weaklistoffset */
9807 PyObject_SelfIter, /* tp_iter */
9808 (iternextfunc)unicodeiter_next, /* tp_iternext */
9809 unicodeiter_methods, /* tp_methods */
9810 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009811};
9812
9813static PyObject *
9814unicode_iter(PyObject *seq)
9815{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009816 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009817
Benjamin Peterson14339b62009-01-31 16:36:08 +00009818 if (!PyUnicode_Check(seq)) {
9819 PyErr_BadInternalCall();
9820 return NULL;
9821 }
9822 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9823 if (it == NULL)
9824 return NULL;
9825 it->it_index = 0;
9826 Py_INCREF(seq);
9827 it->it_seq = (PyUnicodeObject *)seq;
9828 _PyObject_GC_TRACK(it);
9829 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009830}
9831
Martin v. Löwis5b222132007-06-10 09:51:05 +00009832size_t
9833Py_UNICODE_strlen(const Py_UNICODE *u)
9834{
9835 int res = 0;
9836 while(*u++)
9837 res++;
9838 return res;
9839}
9840
9841Py_UNICODE*
9842Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9843{
9844 Py_UNICODE *u = s1;
9845 while ((*u++ = *s2++));
9846 return s1;
9847}
9848
9849Py_UNICODE*
9850Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9851{
9852 Py_UNICODE *u = s1;
9853 while ((*u++ = *s2++))
9854 if (n-- == 0)
9855 break;
9856 return s1;
9857}
9858
9859int
9860Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9861{
9862 while (*s1 && *s2 && *s1 == *s2)
9863 s1++, s2++;
9864 if (*s1 && *s2)
9865 return (*s1 < *s2) ? -1 : +1;
9866 if (*s1)
9867 return 1;
9868 if (*s2)
9869 return -1;
9870 return 0;
9871}
9872
9873Py_UNICODE*
9874Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9875{
9876 const Py_UNICODE *p;
9877 for (p = s; *p; p++)
9878 if (*p == c)
9879 return (Py_UNICODE*)p;
9880 return NULL;
9881}
9882
9883
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009884#ifdef __cplusplus
9885}
9886#endif