blob: b97621b9344639cc3d5e7ecd88086e1eced14c6a [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by ASCII code point (0x00 - 0x7F); an entry is 1
   when the character counts as whitespace and 0 otherwise.  Each row
   below covers sixteen consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F:
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
Christian Heimes190d79e2008-01-30 11:58:22 +0000168/* Same for linebreaks */
/* Lookup table indexed by ASCII code point (0x00 - 0x7F); an entry is 1
   when the character is treated as a line break and 0 otherwise.  Each
   row below covers sixteen consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F:
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
232#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Non-zero if chr is found in set[0..setlen-1].  The bloom mask is
   tested first, so the linear unicode_member() scan only runs when the
   bit for chr is set in mask (mask is expected to have been built from
   the same set, e.g. with make_bloom_mask()).  The whole expansion and
   every argument are parenthesized so the macro can be negated or
   combined with other operators safely.  chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
725
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000726/* size of fixed-size buffer for formatting single arguments */
727#define ITEM_BUFFER_LEN 21
728/* maximum number of characters required for output of %ld. 21 characters
729 allows for 64-bit integers (in decimal) and an optional sign. */
730#define MAX_LONG_CHARS 21
731/* maximum number of characters required for output of %lld.
732 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
733 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
734#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 "coercing to str: need string or buffer, "
1277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
1296PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001297 Py_ssize_t size,
1298 const char *encoding,
1299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001302 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001303 char lower[20]; /* Enough for any encoding name we recognize */
1304 char *l;
1305 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001306
1307 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001308 encoding = PyUnicode_GetDefaultEncoding();
1309
1310 /* Convert encoding to lower case and replace '_' with '-' in order to
1311 catch e.g. UTF_8 */
1312 e = encoding;
1313 l = lower;
1314 while (*e && l < &lower[(sizeof lower) - 2]) {
1315 if (ISUPPER(*e)) {
1316 *l++ = TOLOWER(*e++);
1317 }
1318 else if (*e == '_') {
1319 *l++ = '-';
1320 e++;
1321 }
1322 else {
1323 *l++ = *e++;
1324 }
1325 }
1326 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001327
1328 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001329 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001331 else if ((strcmp(lower, "latin-1") == 0) ||
1332 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001333 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001334#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001335 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001336 return PyUnicode_DecodeMBCS(s, size, errors);
1337#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001338 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001339 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001340 else if (strcmp(lower, "utf-16") == 0)
1341 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1342 else if (strcmp(lower, "utf-32") == 0)
1343 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344
1345 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001346 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001347 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001348 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001349 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350 if (buffer == NULL)
1351 goto onError;
1352 unicode = PyCodec_Decode(buffer, encoding, errors);
1353 if (unicode == NULL)
1354 goto onError;
1355 if (!PyUnicode_Check(unicode)) {
1356 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001357 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001358 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 Py_DECREF(unicode);
1360 goto onError;
1361 }
1362 Py_DECREF(buffer);
1363 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001364
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 Py_XDECREF(buffer);
1367 return NULL;
1368}
1369
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001370PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1371 const char *encoding,
1372 const char *errors)
1373{
1374 PyObject *v;
1375
1376 if (!PyUnicode_Check(unicode)) {
1377 PyErr_BadArgument();
1378 goto onError;
1379 }
1380
1381 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001382 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001383
1384 /* Decode via the codec registry */
1385 v = PyCodec_Decode(unicode, encoding, errors);
1386 if (v == NULL)
1387 goto onError;
1388 return v;
1389
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001391 return NULL;
1392}
1393
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001394PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1395 const char *encoding,
1396 const char *errors)
1397{
1398 PyObject *v;
1399
1400 if (!PyUnicode_Check(unicode)) {
1401 PyErr_BadArgument();
1402 goto onError;
1403 }
1404
1405 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001406 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001407
1408 /* Decode via the codec registry */
1409 v = PyCodec_Decode(unicode, encoding, errors);
1410 if (v == NULL)
1411 goto onError;
1412 if (!PyUnicode_Check(v)) {
1413 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001414 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001415 Py_TYPE(v)->tp_name);
1416 Py_DECREF(v);
1417 goto onError;
1418 }
1419 return v;
1420
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001422 return NULL;
1423}
1424
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 Py_ssize_t size,
1427 const char *encoding,
1428 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429{
1430 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001431
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432 unicode = PyUnicode_FromUnicode(s, size);
1433 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1436 Py_DECREF(unicode);
1437 return v;
1438}
1439
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001440PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1441 const char *encoding,
1442 const char *errors)
1443{
1444 PyObject *v;
1445
1446 if (!PyUnicode_Check(unicode)) {
1447 PyErr_BadArgument();
1448 goto onError;
1449 }
1450
1451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001453
1454 /* Encode via the codec registry */
1455 v = PyCodec_Encode(unicode, encoding, errors);
1456 if (v == NULL)
1457 goto onError;
1458 return v;
1459
Benjamin Peterson29060642009-01-31 22:14:21 +00001460 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001461 return NULL;
1462}
1463
Victor Stinnerae6265f2010-05-15 16:27:27 +00001464PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1465{
1466 if (Py_FileSystemDefaultEncoding)
1467 return PyUnicode_AsEncodedString(unicode,
1468 Py_FileSystemDefaultEncoding,
1469 "surrogateescape");
1470 else
1471 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1472 PyUnicode_GET_SIZE(unicode),
1473 "surrogateescape");
1474}
1475
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1477 const char *encoding,
1478 const char *errors)
1479{
1480 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001481
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 if (!PyUnicode_Check(unicode)) {
1483 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485 }
Fred Drakee4315f52000-05-09 19:53:39 +00001486
Tim Petersced69f82003-09-16 20:30:58 +00001487 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001489
1490 /* Shortcuts for common default encodings */
Victor Stinner59e62db2010-05-15 13:14:32 +00001491 if (strcmp(encoding, "utf-8") == 0)
1492 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1493 PyUnicode_GET_SIZE(unicode),
1494 errors);
1495 else if (strcmp(encoding, "latin-1") == 0)
1496 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1497 PyUnicode_GET_SIZE(unicode),
1498 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001499#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner59e62db2010-05-15 13:14:32 +00001500 else if (strcmp(encoding, "mbcs") == 0)
1501 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1502 PyUnicode_GET_SIZE(unicode),
1503 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001504#endif
Victor Stinner59e62db2010-05-15 13:14:32 +00001505 else if (strcmp(encoding, "ascii") == 0)
1506 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1507 PyUnicode_GET_SIZE(unicode),
1508 errors);
1509 /* During bootstrap, we may need to find the encodings
1510 package, to load the file system encoding, and require the
1511 file system encoding in order to load the encodings
1512 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001513
Victor Stinner59e62db2010-05-15 13:14:32 +00001514 Break out of this dependency by assuming that the path to
1515 the encodings module is ASCII-only. XXX could try wcstombs
1516 instead, if the file system encoding is the locale's
1517 encoding. */
1518 else if (Py_FileSystemDefaultEncoding &&
1519 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1520 !PyThreadState_GET()->interp->codecs_initialized)
1521 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1522 PyUnicode_GET_SIZE(unicode),
1523 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001524
1525 /* Encode via the codec registry */
1526 v = PyCodec_Encode(unicode, encoding, errors);
1527 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001528 return NULL;
1529
1530 /* The normal path */
1531 if (PyBytes_Check(v))
1532 return v;
1533
1534 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001535 if (PyByteArray_Check(v)) {
1536 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001537 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001538 PyOS_snprintf(msg, sizeof(msg),
1539 "encoder %s returned buffer instead of bytes",
1540 encoding);
1541 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001542 Py_DECREF(v);
1543 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001544 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001545
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001546 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1547 Py_DECREF(v);
1548 return b;
1549 }
1550
1551 PyErr_Format(PyExc_TypeError,
1552 "encoder did not return a bytes object (type=%.400s)",
1553 Py_TYPE(v)->tp_name);
1554 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001555 return NULL;
1556}
1557
1558PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1559 const char *encoding,
1560 const char *errors)
1561{
1562 PyObject *v;
1563
1564 if (!PyUnicode_Check(unicode)) {
1565 PyErr_BadArgument();
1566 goto onError;
1567 }
1568
1569 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001571
1572 /* Encode via the codec registry */
1573 v = PyCodec_Encode(unicode, encoding, errors);
1574 if (v == NULL)
1575 goto onError;
1576 if (!PyUnicode_Check(v)) {
1577 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001578 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001579 Py_TYPE(v)->tp_name);
1580 Py_DECREF(v);
1581 goto onError;
1582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 return NULL;
1587}
1588
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001589PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001590 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001591{
1592 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001593 if (v)
1594 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001595 if (errors != NULL)
1596 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001597 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001598 PyUnicode_GET_SIZE(unicode),
1599 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001600 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001601 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001602 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001603 return v;
1604}
1605
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001606PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001607PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001608 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001609 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1610}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001611
Christian Heimes5894ba72007-11-04 11:43:14 +00001612PyObject*
1613PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1614{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001615 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1616 can be undefined. If it is case, decode using UTF-8. The following assumes
1617 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1618 bootstrapping process where the codecs aren't ready yet.
1619 */
1620 if (Py_FileSystemDefaultEncoding) {
1621#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001622 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001623 return PyUnicode_DecodeMBCS(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001624 }
1625#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001626 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001627 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001628 }
1629#endif
1630 return PyUnicode_Decode(s, size,
1631 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001632 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001633 }
1634 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001635 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001636 }
1637}
1638
Martin v. Löwis011e8422009-05-05 04:43:17 +00001639/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001640 system encoding. The addr param must be a PyObject**.
1641 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001642
1643int
1644PyUnicode_FSConverter(PyObject* arg, void* addr)
1645{
1646 PyObject *output = NULL;
1647 Py_ssize_t size;
1648 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001649 if (arg == NULL) {
1650 Py_DECREF(*(PyObject**)addr);
1651 return 1;
1652 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001653 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001654 output = arg;
1655 Py_INCREF(output);
1656 }
1657 else {
1658 arg = PyUnicode_FromObject(arg);
1659 if (!arg)
1660 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001661 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001662 Py_DECREF(arg);
1663 if (!output)
1664 return 0;
1665 if (!PyBytes_Check(output)) {
1666 Py_DECREF(output);
1667 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1668 return 0;
1669 }
1670 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001671 size = PyBytes_GET_SIZE(output);
1672 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001673 if (size != strlen(data)) {
1674 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1675 Py_DECREF(output);
1676 return 0;
1677 }
1678 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001679 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001680}
1681
1682
Martin v. Löwis5b222132007-06-10 09:51:05 +00001683char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001684_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001685{
Christian Heimesf3863112007-11-22 07:46:41 +00001686 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001687 if (!PyUnicode_Check(unicode)) {
1688 PyErr_BadArgument();
1689 return NULL;
1690 }
Christian Heimesf3863112007-11-22 07:46:41 +00001691 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1692 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001693 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001694 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001695 *psize = PyBytes_GET_SIZE(bytes);
1696 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001697}
1698
1699char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001700_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001701{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001702 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001703}
1704
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1706{
1707 if (!PyUnicode_Check(unicode)) {
1708 PyErr_BadArgument();
1709 goto onError;
1710 }
1711 return PyUnicode_AS_UNICODE(unicode);
1712
Benjamin Peterson29060642009-01-31 22:14:21 +00001713 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714 return NULL;
1715}
1716
Martin v. Löwis18e16552006-02-15 17:27:45 +00001717Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718{
1719 if (!PyUnicode_Check(unicode)) {
1720 PyErr_BadArgument();
1721 goto onError;
1722 }
1723 return PyUnicode_GET_SIZE(unicode);
1724
Benjamin Peterson29060642009-01-31 22:14:21 +00001725 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 return -1;
1727}
1728
Thomas Wouters78890102000-07-22 19:25:51 +00001729const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001730{
1731 return unicode_default_encoding;
1732}
1733
1734int PyUnicode_SetDefaultEncoding(const char *encoding)
1735{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001736 if (strcmp(encoding, unicode_default_encoding) != 0) {
1737 PyErr_Format(PyExc_ValueError,
1738 "Can only set default encoding to %s",
1739 unicode_default_encoding);
1740 return -1;
1741 }
Fred Drakee4315f52000-05-09 19:53:39 +00001742 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001743}
1744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001745/* error handling callback helper:
1746 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001747 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 and adjust various state variables.
1749 return 0 on success, -1 on error
1750*/
1751
1752static
1753int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001754 const char *encoding, const char *reason,
1755 const char **input, const char **inend, Py_ssize_t *startinpos,
1756 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1757 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001758{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001759 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760
1761 PyObject *restuple = NULL;
1762 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001763 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001764 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001765 Py_ssize_t requiredsize;
1766 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001767 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001768 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001769 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001770 int res = -1;
1771
1772 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001773 *errorHandler = PyCodec_LookupError(errors);
1774 if (*errorHandler == NULL)
1775 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001776 }
1777
1778 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001779 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1781 if (*exceptionObject == NULL)
1782 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001783 }
1784 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1786 goto onError;
1787 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1788 goto onError;
1789 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1790 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001791 }
1792
1793 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1794 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001795 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001796 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001797 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 }
1800 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001801 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001802
1803 /* Copy back the bytes variables, which might have been modified by the
1804 callback */
1805 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1806 if (!inputobj)
1807 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001808 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001809 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001810 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001811 *input = PyBytes_AS_STRING(inputobj);
1812 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001813 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001814 /* we can DECREF safely, as the exception has another reference,
1815 so the object won't go away. */
1816 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001819 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001820 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001821 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1822 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001823 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824
1825 /* need more space? (at least enough for what we
1826 have+the replacement+the rest of the string (starting
1827 at the new input position), so we won't have to check space
1828 when there are no errors in the rest of the string) */
1829 repptr = PyUnicode_AS_UNICODE(repunicode);
1830 repsize = PyUnicode_GET_SIZE(repunicode);
1831 requiredsize = *outpos + repsize + insize-newpos;
1832 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001833 if (requiredsize<2*outsize)
1834 requiredsize = 2*outsize;
1835 if (_PyUnicode_Resize(output, requiredsize) < 0)
1836 goto onError;
1837 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001838 }
1839 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001840 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 Py_UNICODE_COPY(*outptr, repptr, repsize);
1842 *outptr += repsize;
1843 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001844
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 /* we made it! */
1846 res = 0;
1847
Benjamin Peterson29060642009-01-31 22:14:21 +00001848 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 Py_XDECREF(restuple);
1850 return res;
1851}
1852
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853/* --- UTF-7 Codec -------------------------------------------------------- */
1854
Antoine Pitrou244651a2009-05-04 18:56:13 +00001855/* See RFC2152 for details. We encode conservatively and decode liberally. */
1856
1857/* Three simple macros defining base-64. */
1858
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' ||                      \
     (c) == '/')
1866
/* Map a base-64 character to its 6-bit value.  c must already be known
   to be a base-64 character; anything that is not a letter, digit or
   '+' is taken to be '/' (63). */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)
1874
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
1879
/* DECODE_DIRECT: true if a byte found in a UTF-7 string outside a
 * shift sequence stands for itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1887
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry is one of:
 *   0 : "Set D"       -- alphanumeric and '(),-./:?
 *   1 : "Set O"       -- !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"  -- ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    2, 1, 1, 1, 1, 1, 1, 0,  0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 3, 3,
};
1921
/* ENCODE_DIRECT: true if the encoder should emit character c as
 * itself.  Only non-NUL ASCII characters qualify: Set D always, Set O
 * when directO is true, whitespace when directWS is true.  RFC2152
 * makes it clear that the Set O and whitespace choices vary between
 * applications, so both are left to the caller. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001933
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001934PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001935 Py_ssize_t size,
1936 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001937{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001938 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1939}
1940
Antoine Pitrou244651a2009-05-04 18:56:13 +00001941/* The decoder. The only state we preserve is our read position,
1942 * i.e. how many characters we have consumed. So if we end in the
1943 * middle of a shift sequence we have to back off the read position
1944 * and the output to the beginning of the sequence, otherwise we lose
1945 * all the shift state (seen bits, number of bits seen, high
1946 * surrogate). */
1947
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001948PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001949 Py_ssize_t size,
1950 const char *errors,
1951 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001952{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001953 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001954 Py_ssize_t startinpos;
1955 Py_ssize_t endinpos;
1956 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 const char *e;
1958 PyUnicodeObject *unicode;
1959 Py_UNICODE *p;
1960 const char *errmsg = "";
1961 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001962 Py_UNICODE *shiftOutStart;
1963 unsigned int base64bits = 0;
1964 unsigned long base64buffer = 0;
1965 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001966 PyObject *errorHandler = NULL;
1967 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001968
1969 unicode = _PyUnicode_New(size);
1970 if (!unicode)
1971 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001972 if (size == 0) {
1973 if (consumed)
1974 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001975 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001976 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001977
1978 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001979 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001980 e = s + size;
1981
1982 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001983 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001984 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001985 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001986
Antoine Pitrou244651a2009-05-04 18:56:13 +00001987 if (inShift) { /* in a base-64 section */
1988 if (IS_BASE64(ch)) { /* consume a base-64 character */
1989 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1990 base64bits += 6;
1991 s++;
1992 if (base64bits >= 16) {
1993 /* we have enough bits for a UTF-16 value */
1994 Py_UNICODE outCh = (Py_UNICODE)
1995 (base64buffer >> (base64bits-16));
1996 base64bits -= 16;
1997 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1998 if (surrogate) {
1999 /* expecting a second surrogate */
2000 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2001#ifdef Py_UNICODE_WIDE
2002 *p++ = (((surrogate & 0x3FF)<<10)
2003 | (outCh & 0x3FF)) + 0x10000;
2004#else
2005 *p++ = surrogate;
2006 *p++ = outCh;
2007#endif
2008 surrogate = 0;
2009 }
2010 else {
2011 surrogate = 0;
2012 errmsg = "second surrogate missing";
2013 goto utf7Error;
2014 }
2015 }
2016 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2017 /* first surrogate */
2018 surrogate = outCh;
2019 }
2020 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2021 errmsg = "unexpected second surrogate";
2022 goto utf7Error;
2023 }
2024 else {
2025 *p++ = outCh;
2026 }
2027 }
2028 }
2029 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002030 inShift = 0;
2031 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002032 if (surrogate) {
2033 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002034 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002035 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002036 if (base64bits > 0) { /* left-over bits */
2037 if (base64bits >= 6) {
2038 /* We've seen at least one base-64 character */
2039 errmsg = "partial character in shift sequence";
2040 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002041 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002042 else {
2043 /* Some bits remain; they should be zero */
2044 if (base64buffer != 0) {
2045 errmsg = "non-zero padding bits in shift sequence";
2046 goto utf7Error;
2047 }
2048 }
2049 }
2050 if (ch != '-') {
2051 /* '-' is absorbed; other terminating
2052 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002053 *p++ = ch;
2054 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002055 }
2056 }
2057 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002059 s++; /* consume '+' */
2060 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002061 s++;
2062 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002063 }
2064 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002065 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002066 shiftOutStart = p;
2067 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002068 }
2069 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002070 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002071 *p++ = ch;
2072 s++;
2073 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002074 else {
2075 startinpos = s-starts;
2076 s++;
2077 errmsg = "unexpected special character";
2078 goto utf7Error;
2079 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002080 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002081utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 outpos = p-PyUnicode_AS_UNICODE(unicode);
2083 endinpos = s-starts;
2084 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002085 errors, &errorHandler,
2086 "utf7", errmsg,
2087 &starts, &e, &startinpos, &endinpos, &exc, &s,
2088 &unicode, &outpos, &p))
2089 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002090 }
2091
Antoine Pitrou244651a2009-05-04 18:56:13 +00002092 /* end of string */
2093
2094 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2095 /* if we're in an inconsistent state, that's an error */
2096 if (surrogate ||
2097 (base64bits >= 6) ||
2098 (base64bits > 0 && base64buffer != 0)) {
2099 outpos = p-PyUnicode_AS_UNICODE(unicode);
2100 endinpos = size;
2101 if (unicode_decode_call_errorhandler(
2102 errors, &errorHandler,
2103 "utf7", "unterminated shift sequence",
2104 &starts, &e, &startinpos, &endinpos, &exc, &s,
2105 &unicode, &outpos, &p))
2106 goto onError;
2107 if (s < e)
2108 goto restart;
2109 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002110 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002111
2112 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002113 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002114 if (inShift) {
2115 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002116 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002117 }
2118 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002119 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002120 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002121 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002122
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002123 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002124 goto onError;
2125
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002126 Py_XDECREF(errorHandler);
2127 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002128 return (PyObject *)unicode;
2129
Benjamin Peterson29060642009-01-31 22:14:21 +00002130 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 Py_XDECREF(errorHandler);
2132 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002133 Py_DECREF(unicode);
2134 return NULL;
2135}
2136
2137
2138PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002139 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002140 int base64SetO,
2141 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002142 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002143{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002144 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002145 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002146 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002147 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002148 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002149 unsigned int base64bits = 0;
2150 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002151 char * out;
2152 char * start;
2153
2154 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002155 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002156
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002157 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002158 return PyErr_NoMemory();
2159
Antoine Pitrou244651a2009-05-04 18:56:13 +00002160 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002161 if (v == NULL)
2162 return NULL;
2163
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002164 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002165 for (;i < size; ++i) {
2166 Py_UNICODE ch = s[i];
2167
Antoine Pitrou244651a2009-05-04 18:56:13 +00002168 if (inShift) {
2169 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2170 /* shifting out */
2171 if (base64bits) { /* output remaining bits */
2172 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2173 base64buffer = 0;
2174 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002175 }
2176 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002177 /* Characters not in the BASE64 set implicitly unshift the sequence
2178 so no '-' is required, except if the character is itself a '-' */
2179 if (IS_BASE64(ch) || ch == '-') {
2180 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002181 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002182 *out++ = (char) ch;
2183 }
2184 else {
2185 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002186 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002187 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002188 else { /* not in a shift sequence */
2189 if (ch == '+') {
2190 *out++ = '+';
2191 *out++ = '-';
2192 }
2193 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2194 *out++ = (char) ch;
2195 }
2196 else {
2197 *out++ = '+';
2198 inShift = 1;
2199 goto encode_char;
2200 }
2201 }
2202 continue;
2203encode_char:
2204#ifdef Py_UNICODE_WIDE
2205 if (ch >= 0x10000) {
2206 /* code first surrogate */
2207 base64bits += 16;
2208 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2209 while (base64bits >= 6) {
2210 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2211 base64bits -= 6;
2212 }
2213 /* prepare second surrogate */
2214 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2215 }
2216#endif
2217 base64bits += 16;
2218 base64buffer = (base64buffer << 16) | ch;
2219 while (base64bits >= 6) {
2220 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2221 base64bits -= 6;
2222 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002223 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002224 if (base64bits)
2225 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2226 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002227 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002228 if (_PyBytes_Resize(&v, out - start) < 0)
2229 return NULL;
2230 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002231}
2232
Antoine Pitrou244651a2009-05-04 18:56:13 +00002233#undef IS_BASE64
2234#undef FROM_BASE64
2235#undef TO_BASE64
2236#undef DECODE_DIRECT
2237#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002238
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239/* --- UTF-8 Codec -------------------------------------------------------- */
2240
/* Map a UTF-8 lead byte to the total length of the sequence it introduces.
   Zero means the byte is not a legal prefix (the continuation bytes
   0x80..0xBF, and 0xFE/0xFF).  The 5- and 6-byte entries follow the
   original UTF-8 definition in RFC 2279.
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2262
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002264 Py_ssize_t size,
2265 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266{
Walter Dörwald69652032004-09-07 20:24:22 +00002267 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2268}
2269
Antoine Pitrouab868312009-01-10 15:40:25 +00002270/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2271#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2272
2273/* Mask to quickly check whether a C 'long' contains a
2274 non-ASCII, UTF8-encoded char. */
2275#if (SIZEOF_LONG == 8)
2276# define ASCII_CHAR_MASK 0x8080808080808080L
2277#elif (SIZEOF_LONG == 4)
2278# define ASCII_CHAR_MASK 0x80808080L
2279#else
2280# error C 'long' size should be either 4 or 8!
2281#endif
2282
Walter Dörwald69652032004-09-07 20:24:22 +00002283PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002284 Py_ssize_t size,
2285 const char *errors,
2286 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002287{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002288 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002290 Py_ssize_t startinpos;
2291 Py_ssize_t endinpos;
2292 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002293 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294 PyUnicodeObject *unicode;
2295 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002296 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002297 PyObject *errorHandler = NULL;
2298 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299
2300 /* Note: size will always be longer than the resulting Unicode
2301 character count */
2302 unicode = _PyUnicode_New(size);
2303 if (!unicode)
2304 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002305 if (size == 0) {
2306 if (consumed)
2307 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310
2311 /* Unpack UTF-8 encoded data */
2312 p = unicode->str;
2313 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002314 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315
2316 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002317 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318
2319 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002320 /* Fast path for runs of ASCII characters. Given that common UTF-8
2321 input will consist of an overwhelming majority of ASCII
2322 characters, we try to optimize for this case by checking
2323 as many characters as a C 'long' can contain.
2324 First, check if we can do an aligned read, as most CPUs have
2325 a penalty for unaligned reads.
2326 */
2327 if (!((size_t) s & LONG_PTR_MASK)) {
2328 /* Help register allocation */
2329 register const char *_s = s;
2330 register Py_UNICODE *_p = p;
2331 while (_s < aligned_end) {
2332 /* Read a whole long at a time (either 4 or 8 bytes),
2333 and do a fast unrolled copy if it only contains ASCII
2334 characters. */
2335 unsigned long data = *(unsigned long *) _s;
2336 if (data & ASCII_CHAR_MASK)
2337 break;
2338 _p[0] = (unsigned char) _s[0];
2339 _p[1] = (unsigned char) _s[1];
2340 _p[2] = (unsigned char) _s[2];
2341 _p[3] = (unsigned char) _s[3];
2342#if (SIZEOF_LONG == 8)
2343 _p[4] = (unsigned char) _s[4];
2344 _p[5] = (unsigned char) _s[5];
2345 _p[6] = (unsigned char) _s[6];
2346 _p[7] = (unsigned char) _s[7];
2347#endif
2348 _s += SIZEOF_LONG;
2349 _p += SIZEOF_LONG;
2350 }
2351 s = _s;
2352 p = _p;
2353 if (s == e)
2354 break;
2355 ch = (unsigned char)*s;
2356 }
2357 }
2358
2359 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002360 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361 s++;
2362 continue;
2363 }
2364
2365 n = utf8_code_length[ch];
2366
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002367 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002368 if (consumed)
2369 break;
2370 else {
2371 errmsg = "unexpected end of data";
2372 startinpos = s-starts;
2373 endinpos = size;
2374 goto utf8Error;
2375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377
2378 switch (n) {
2379
2380 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002381 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002382 startinpos = s-starts;
2383 endinpos = startinpos+1;
2384 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002385
2386 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002387 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002388 startinpos = s-starts;
2389 endinpos = startinpos+1;
2390 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002391
2392 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002393 if ((s[1] & 0xc0) != 0x80) {
2394 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002395 startinpos = s-starts;
2396 endinpos = startinpos+2;
2397 goto utf8Error;
2398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002400 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002401 startinpos = s-starts;
2402 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002403 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002404 goto utf8Error;
2405 }
2406 else
2407 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 break;
2409
2410 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002411 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002412 (s[2] & 0xc0) != 0x80) {
2413 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002414 startinpos = s-starts;
2415 endinpos = startinpos+3;
2416 goto utf8Error;
2417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002419 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002420 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002421 startinpos = s-starts;
2422 endinpos = startinpos+3;
2423 goto utf8Error;
2424 }
2425 else
2426 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002427 break;
2428
2429 case 4:
2430 if ((s[1] & 0xc0) != 0x80 ||
2431 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002432 (s[3] & 0xc0) != 0x80) {
2433 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002434 startinpos = s-starts;
2435 endinpos = startinpos+4;
2436 goto utf8Error;
2437 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002438 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002439 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002440 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002441 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002442 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002443 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002444 UTF-16 */
2445 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002446 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002447 startinpos = s-starts;
2448 endinpos = startinpos+4;
2449 goto utf8Error;
2450 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002451#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002452 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002453#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002454 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002455
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002456 /* translate from 10000..10FFFF to 0..FFFF */
2457 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002458
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002459 /* high surrogate = top 10 bits added to D800 */
2460 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002461
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002462 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002463 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002464#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 break;
2466
2467 default:
2468 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002469 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002470 startinpos = s-starts;
2471 endinpos = startinpos+n;
2472 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473 }
2474 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002475 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002476
Benjamin Peterson29060642009-01-31 22:14:21 +00002477 utf8Error:
2478 outpos = p-PyUnicode_AS_UNICODE(unicode);
2479 if (unicode_decode_call_errorhandler(
2480 errors, &errorHandler,
2481 "utf8", errmsg,
2482 &starts, &e, &startinpos, &endinpos, &exc, &s,
2483 &unicode, &outpos, &p))
2484 goto onError;
2485 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 }
Walter Dörwald69652032004-09-07 20:24:22 +00002487 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489
2490 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002491 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 goto onError;
2493
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002494 Py_XDECREF(errorHandler);
2495 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 return (PyObject *)unicode;
2497
Benjamin Peterson29060642009-01-31 22:14:21 +00002498 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 Py_XDECREF(errorHandler);
2500 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 Py_DECREF(unicode);
2502 return NULL;
2503}
2504
Antoine Pitrouab868312009-01-10 15:40:25 +00002505#undef ASCII_CHAR_MASK
2506
2507
Tim Peters602f7402002-04-27 18:03:26 +00002508/* Allocation strategy: if the string is short, convert into a stack buffer
2509 and allocate exactly as much space needed at the end. Else allocate the
2510 maximum possible needed (4 result bytes per Unicode character), and return
2511 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002512*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002513PyObject *
2514PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002515 Py_ssize_t size,
2516 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517{
Tim Peters602f7402002-04-27 18:03:26 +00002518#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002519
Guido van Rossum98297ee2007-11-06 21:34:58 +00002520 Py_ssize_t i; /* index into s of next input byte */
2521 PyObject *result; /* result string object */
2522 char *p; /* next free byte in output buffer */
2523 Py_ssize_t nallocated; /* number of result bytes allocated */
2524 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002525 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002526 PyObject *errorHandler = NULL;
2527 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002528
Tim Peters602f7402002-04-27 18:03:26 +00002529 assert(s != NULL);
2530 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531
Tim Peters602f7402002-04-27 18:03:26 +00002532 if (size <= MAX_SHORT_UNICHARS) {
2533 /* Write into the stack buffer; nallocated can't overflow.
2534 * At the end, we'll allocate exactly as much heap space as it
2535 * turns out we need.
2536 */
2537 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002538 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002539 p = stackbuf;
2540 }
2541 else {
2542 /* Overallocate on the heap, and give the excess back at the end. */
2543 nallocated = size * 4;
2544 if (nallocated / 4 != size) /* overflow! */
2545 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002546 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002547 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002548 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002549 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002550 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002551
Tim Peters602f7402002-04-27 18:03:26 +00002552 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002553 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002554
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002555 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002556 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002558
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002560 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002561 *p++ = (char)(0xc0 | (ch >> 6));
2562 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002563 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002564#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002565 /* Special case: check for high and low surrogate */
2566 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2567 Py_UCS4 ch2 = s[i];
2568 /* Combine the two surrogates to form a UCS4 value */
2569 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2570 i++;
2571
2572 /* Encode UCS4 Unicode ordinals */
2573 *p++ = (char)(0xf0 | (ch >> 18));
2574 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002575 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2576 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002577 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002578#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002579 Py_ssize_t newpos;
2580 PyObject *rep;
2581 Py_ssize_t repsize, k;
2582 rep = unicode_encode_call_errorhandler
2583 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2584 s, size, &exc, i-1, i, &newpos);
2585 if (!rep)
2586 goto error;
2587
2588 if (PyBytes_Check(rep))
2589 repsize = PyBytes_GET_SIZE(rep);
2590 else
2591 repsize = PyUnicode_GET_SIZE(rep);
2592
2593 if (repsize > 4) {
2594 Py_ssize_t offset;
2595
2596 if (result == NULL)
2597 offset = p - stackbuf;
2598 else
2599 offset = p - PyBytes_AS_STRING(result);
2600
2601 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2602 /* integer overflow */
2603 PyErr_NoMemory();
2604 goto error;
2605 }
2606 nallocated += repsize - 4;
2607 if (result != NULL) {
2608 if (_PyBytes_Resize(&result, nallocated) < 0)
2609 goto error;
2610 } else {
2611 result = PyBytes_FromStringAndSize(NULL, nallocated);
2612 if (result == NULL)
2613 goto error;
2614 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2615 }
2616 p = PyBytes_AS_STRING(result) + offset;
2617 }
2618
2619 if (PyBytes_Check(rep)) {
2620 char *prep = PyBytes_AS_STRING(rep);
2621 for(k = repsize; k > 0; k--)
2622 *p++ = *prep++;
2623 } else /* rep is unicode */ {
2624 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2625 Py_UNICODE c;
2626
2627 for(k=0; k<repsize; k++) {
2628 c = prep[k];
2629 if (0x80 <= c) {
2630 raise_encode_exception(&exc, "utf-8", s, size,
2631 i-1, i, "surrogates not allowed");
2632 goto error;
2633 }
2634 *p++ = (char)prep[k];
2635 }
2636 }
2637 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002638#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002639 }
Victor Stinner445a6232010-04-22 20:01:57 +00002640#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002641 } else if (ch < 0x10000) {
2642 *p++ = (char)(0xe0 | (ch >> 12));
2643 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2644 *p++ = (char)(0x80 | (ch & 0x3f));
2645 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002646 /* Encode UCS4 Unicode ordinals */
2647 *p++ = (char)(0xf0 | (ch >> 18));
2648 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2649 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2650 *p++ = (char)(0x80 | (ch & 0x3f));
2651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002653
Guido van Rossum98297ee2007-11-06 21:34:58 +00002654 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002655 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002656 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002657 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002658 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002659 }
2660 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002661 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002662 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002663 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002664 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002665 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002666 Py_XDECREF(errorHandler);
2667 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002668 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002669 error:
2670 Py_XDECREF(errorHandler);
2671 Py_XDECREF(exc);
2672 Py_XDECREF(result);
2673 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002674
Tim Peters602f7402002-04-27 18:03:26 +00002675#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676}
2677
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2679{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 if (!PyUnicode_Check(unicode)) {
2681 PyErr_BadArgument();
2682 return NULL;
2683 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002684 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 PyUnicode_GET_SIZE(unicode),
2686 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687}
2688
Walter Dörwald41980ca2007-08-16 21:55:45 +00002689/* --- UTF-32 Codec ------------------------------------------------------- */
2690
2691PyObject *
2692PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 Py_ssize_t size,
2694 const char *errors,
2695 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002696{
2697 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2698}
2699
2700PyObject *
2701PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002702 Py_ssize_t size,
2703 const char *errors,
2704 int *byteorder,
2705 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002706{
2707 const char *starts = s;
2708 Py_ssize_t startinpos;
2709 Py_ssize_t endinpos;
2710 Py_ssize_t outpos;
2711 PyUnicodeObject *unicode;
2712 Py_UNICODE *p;
2713#ifndef Py_UNICODE_WIDE
2714 int i, pairs;
2715#else
2716 const int pairs = 0;
2717#endif
2718 const unsigned char *q, *e;
2719 int bo = 0; /* assume native ordering by default */
2720 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002721 /* Offsets from q for retrieving bytes in the right order. */
2722#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2723 int iorder[] = {0, 1, 2, 3};
2724#else
2725 int iorder[] = {3, 2, 1, 0};
2726#endif
2727 PyObject *errorHandler = NULL;
2728 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002729 /* On narrow builds we split characters outside the BMP into two
2730 codepoints => count how much extra space we need. */
2731#ifndef Py_UNICODE_WIDE
2732 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002733 if (((Py_UCS4 *)s)[i] >= 0x10000)
2734 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002735#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002736
2737 /* This might be one to much, because of a BOM */
2738 unicode = _PyUnicode_New((size+3)/4+pairs);
2739 if (!unicode)
2740 return NULL;
2741 if (size == 0)
2742 return (PyObject *)unicode;
2743
2744 /* Unpack UTF-32 encoded data */
2745 p = unicode->str;
2746 q = (unsigned char *)s;
2747 e = q + size;
2748
2749 if (byteorder)
2750 bo = *byteorder;
2751
2752 /* Check for BOM marks (U+FEFF) in the input and adjust current
2753 byte order setting accordingly. In native mode, the leading BOM
2754 mark is skipped, in all other modes, it is copied to the output
2755 stream as-is (giving a ZWNBSP character). */
2756 if (bo == 0) {
2757 if (size >= 4) {
2758 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002759 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002760#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002761 if (bom == 0x0000FEFF) {
2762 q += 4;
2763 bo = -1;
2764 }
2765 else if (bom == 0xFFFE0000) {
2766 q += 4;
2767 bo = 1;
2768 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002769#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002770 if (bom == 0x0000FEFF) {
2771 q += 4;
2772 bo = 1;
2773 }
2774 else if (bom == 0xFFFE0000) {
2775 q += 4;
2776 bo = -1;
2777 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002778#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002779 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002780 }
2781
2782 if (bo == -1) {
2783 /* force LE */
2784 iorder[0] = 0;
2785 iorder[1] = 1;
2786 iorder[2] = 2;
2787 iorder[3] = 3;
2788 }
2789 else if (bo == 1) {
2790 /* force BE */
2791 iorder[0] = 3;
2792 iorder[1] = 2;
2793 iorder[2] = 1;
2794 iorder[3] = 0;
2795 }
2796
2797 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002798 Py_UCS4 ch;
2799 /* remaining bytes at the end? (size should be divisible by 4) */
2800 if (e-q<4) {
2801 if (consumed)
2802 break;
2803 errmsg = "truncated data";
2804 startinpos = ((const char *)q)-starts;
2805 endinpos = ((const char *)e)-starts;
2806 goto utf32Error;
2807 /* The remaining input chars are ignored if the callback
2808 chooses to skip the input */
2809 }
2810 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2811 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002812
Benjamin Peterson29060642009-01-31 22:14:21 +00002813 if (ch >= 0x110000)
2814 {
2815 errmsg = "codepoint not in range(0x110000)";
2816 startinpos = ((const char *)q)-starts;
2817 endinpos = startinpos+4;
2818 goto utf32Error;
2819 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002820#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002821 if (ch >= 0x10000)
2822 {
2823 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2824 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2825 }
2826 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002827#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 *p++ = ch;
2829 q += 4;
2830 continue;
2831 utf32Error:
2832 outpos = p-PyUnicode_AS_UNICODE(unicode);
2833 if (unicode_decode_call_errorhandler(
2834 errors, &errorHandler,
2835 "utf32", errmsg,
2836 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2837 &unicode, &outpos, &p))
2838 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002839 }
2840
2841 if (byteorder)
2842 *byteorder = bo;
2843
2844 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002845 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002846
2847 /* Adjust length */
2848 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2849 goto onError;
2850
2851 Py_XDECREF(errorHandler);
2852 Py_XDECREF(exc);
2853 return (PyObject *)unicode;
2854
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002856 Py_DECREF(unicode);
2857 Py_XDECREF(errorHandler);
2858 Py_XDECREF(exc);
2859 return NULL;
2860}
2861
2862PyObject *
2863PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 Py_ssize_t size,
2865 const char *errors,
2866 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002867{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002868 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002869 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002870 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002871#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002872 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002873#else
2874 const int pairs = 0;
2875#endif
2876 /* Offsets from p for storing byte pairs in the right order. */
2877#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2878 int iorder[] = {0, 1, 2, 3};
2879#else
2880 int iorder[] = {3, 2, 1, 0};
2881#endif
2882
Benjamin Peterson29060642009-01-31 22:14:21 +00002883#define STORECHAR(CH) \
2884 do { \
2885 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2886 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2887 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2888 p[iorder[0]] = (CH) & 0xff; \
2889 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002890 } while(0)
2891
2892 /* In narrow builds we can output surrogate pairs as one codepoint,
2893 so we need less space. */
2894#ifndef Py_UNICODE_WIDE
2895 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002896 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2897 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2898 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002899#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002900 nsize = (size - pairs + (byteorder == 0));
2901 bytesize = nsize * 4;
2902 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002903 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002904 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002905 if (v == NULL)
2906 return NULL;
2907
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002908 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002909 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002910 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002911 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002912 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002913
2914 if (byteorder == -1) {
2915 /* force LE */
2916 iorder[0] = 0;
2917 iorder[1] = 1;
2918 iorder[2] = 2;
2919 iorder[3] = 3;
2920 }
2921 else if (byteorder == 1) {
2922 /* force BE */
2923 iorder[0] = 3;
2924 iorder[1] = 2;
2925 iorder[2] = 1;
2926 iorder[3] = 0;
2927 }
2928
2929 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002931#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002932 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2933 Py_UCS4 ch2 = *s;
2934 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2935 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2936 s++;
2937 size--;
2938 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002939 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002940#endif
2941 STORECHAR(ch);
2942 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002943
2944 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002945 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002946#undef STORECHAR
2947}
2948
2949PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2950{
2951 if (!PyUnicode_Check(unicode)) {
2952 PyErr_BadArgument();
2953 return NULL;
2954 }
2955 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002956 PyUnicode_GET_SIZE(unicode),
2957 NULL,
2958 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002959}
2960
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961/* --- UTF-16 Codec ------------------------------------------------------- */
2962
Tim Peters772747b2001-08-09 22:21:55 +00002963PyObject *
2964PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002965 Py_ssize_t size,
2966 const char *errors,
2967 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968{
Walter Dörwald69652032004-09-07 20:24:22 +00002969 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2970}
2971
Antoine Pitrouab868312009-01-10 15:40:25 +00002972/* Two masks for fast checking of whether a C 'long' may contain
2973 UTF16-encoded surrogate characters. This is an efficient heuristic,
2974 assuming that non-surrogate characters with a code point >= 0x8000 are
2975 rare in most input.
2976 FAST_CHAR_MASK is used when the input is in native byte ordering,
2977 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002978*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002979#if (SIZEOF_LONG == 8)
2980# define FAST_CHAR_MASK 0x8000800080008000L
2981# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2982#elif (SIZEOF_LONG == 4)
2983# define FAST_CHAR_MASK 0x80008000L
2984# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2985#else
2986# error C 'long' size should be either 4 or 8!
2987#endif
2988
Walter Dörwald69652032004-09-07 20:24:22 +00002989PyObject *
2990PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 Py_ssize_t size,
2992 const char *errors,
2993 int *byteorder,
2994 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002995{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002997 Py_ssize_t startinpos;
2998 Py_ssize_t endinpos;
2999 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 PyUnicodeObject *unicode;
3001 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003002 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003003 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003004 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003005 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003006 /* Offsets from q for retrieving byte pairs in the right order. */
3007#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3008 int ihi = 1, ilo = 0;
3009#else
3010 int ihi = 0, ilo = 1;
3011#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 PyObject *errorHandler = NULL;
3013 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014
3015 /* Note: size will always be longer than the resulting Unicode
3016 character count */
3017 unicode = _PyUnicode_New(size);
3018 if (!unicode)
3019 return NULL;
3020 if (size == 0)
3021 return (PyObject *)unicode;
3022
3023 /* Unpack UTF-16 encoded data */
3024 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003025 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003026 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027
3028 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003029 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003031 /* Check for BOM marks (U+FEFF) in the input and adjust current
3032 byte order setting accordingly. In native mode, the leading BOM
3033 mark is skipped, in all other modes, it is copied to the output
3034 stream as-is (giving a ZWNBSP character). */
3035 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003036 if (size >= 2) {
3037 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003038#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 if (bom == 0xFEFF) {
3040 q += 2;
3041 bo = -1;
3042 }
3043 else if (bom == 0xFFFE) {
3044 q += 2;
3045 bo = 1;
3046 }
Tim Petersced69f82003-09-16 20:30:58 +00003047#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003048 if (bom == 0xFEFF) {
3049 q += 2;
3050 bo = 1;
3051 }
3052 else if (bom == 0xFFFE) {
3053 q += 2;
3054 bo = -1;
3055 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003056#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059
Tim Peters772747b2001-08-09 22:21:55 +00003060 if (bo == -1) {
3061 /* force LE */
3062 ihi = 1;
3063 ilo = 0;
3064 }
3065 else if (bo == 1) {
3066 /* force BE */
3067 ihi = 0;
3068 ilo = 1;
3069 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003070#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3071 native_ordering = ilo < ihi;
3072#else
3073 native_ordering = ilo > ihi;
3074#endif
Tim Peters772747b2001-08-09 22:21:55 +00003075
Antoine Pitrouab868312009-01-10 15:40:25 +00003076 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003077 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003079 /* First check for possible aligned read of a C 'long'. Unaligned
3080 reads are more expensive, better to defer to another iteration. */
3081 if (!((size_t) q & LONG_PTR_MASK)) {
3082 /* Fast path for runs of non-surrogate chars. */
3083 register const unsigned char *_q = q;
3084 Py_UNICODE *_p = p;
3085 if (native_ordering) {
3086 /* Native ordering is simple: as long as the input cannot
3087 possibly contain a surrogate char, do an unrolled copy
3088 of several 16-bit code points to the target object.
3089 The non-surrogate check is done on several input bytes
3090 at a time (as many as a C 'long' can contain). */
3091 while (_q < aligned_end) {
3092 unsigned long data = * (unsigned long *) _q;
3093 if (data & FAST_CHAR_MASK)
3094 break;
3095 _p[0] = ((unsigned short *) _q)[0];
3096 _p[1] = ((unsigned short *) _q)[1];
3097#if (SIZEOF_LONG == 8)
3098 _p[2] = ((unsigned short *) _q)[2];
3099 _p[3] = ((unsigned short *) _q)[3];
3100#endif
3101 _q += SIZEOF_LONG;
3102 _p += SIZEOF_LONG / 2;
3103 }
3104 }
3105 else {
3106 /* Byteswapped ordering is similar, but we must decompose
3107 the copy bytewise, and take care of zero'ing out the
3108 upper bytes if the target object is in 32-bit units
3109 (that is, in UCS-4 builds). */
3110 while (_q < aligned_end) {
3111 unsigned long data = * (unsigned long *) _q;
3112 if (data & SWAPPED_FAST_CHAR_MASK)
3113 break;
3114 /* Zero upper bytes in UCS-4 builds */
3115#if (Py_UNICODE_SIZE > 2)
3116 _p[0] = 0;
3117 _p[1] = 0;
3118#if (SIZEOF_LONG == 8)
3119 _p[2] = 0;
3120 _p[3] = 0;
3121#endif
3122#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003123 /* Issue #4916; UCS-4 builds on big endian machines must
3124 fill the two last bytes of each 4-byte unit. */
3125#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3126# define OFF 2
3127#else
3128# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003129#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003130 ((unsigned char *) _p)[OFF + 1] = _q[0];
3131 ((unsigned char *) _p)[OFF + 0] = _q[1];
3132 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3133 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3134#if (SIZEOF_LONG == 8)
3135 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3136 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3137 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3138 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3139#endif
3140#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003141 _q += SIZEOF_LONG;
3142 _p += SIZEOF_LONG / 2;
3143 }
3144 }
3145 p = _p;
3146 q = _q;
3147 if (q >= e)
3148 break;
3149 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003150 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151
Benjamin Peterson14339b62009-01-31 16:36:08 +00003152 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003153
3154 if (ch < 0xD800 || ch > 0xDFFF) {
3155 *p++ = ch;
3156 continue;
3157 }
3158
3159 /* UTF-16 code pair: */
3160 if (q > e) {
3161 errmsg = "unexpected end of data";
3162 startinpos = (((const char *)q) - 2) - starts;
3163 endinpos = ((const char *)e) + 1 - starts;
3164 goto utf16Error;
3165 }
3166 if (0xD800 <= ch && ch <= 0xDBFF) {
3167 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3168 q += 2;
3169 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003170#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 *p++ = ch;
3172 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003173#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003174 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003175#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 continue;
3177 }
3178 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003179 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003180 startinpos = (((const char *)q)-4)-starts;
3181 endinpos = startinpos+2;
3182 goto utf16Error;
3183 }
3184
Benjamin Peterson14339b62009-01-31 16:36:08 +00003185 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003186 errmsg = "illegal encoding";
3187 startinpos = (((const char *)q)-2)-starts;
3188 endinpos = startinpos+2;
3189 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003190
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 utf16Error:
3192 outpos = p - PyUnicode_AS_UNICODE(unicode);
3193 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003194 errors,
3195 &errorHandler,
3196 "utf16", errmsg,
3197 &starts,
3198 (const char **)&e,
3199 &startinpos,
3200 &endinpos,
3201 &exc,
3202 (const char **)&q,
3203 &unicode,
3204 &outpos,
3205 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003208 /* remaining byte at the end? (size should be even) */
3209 if (e == q) {
3210 if (!consumed) {
3211 errmsg = "truncated data";
3212 startinpos = ((const char *)q) - starts;
3213 endinpos = ((const char *)e) + 1 - starts;
3214 outpos = p - PyUnicode_AS_UNICODE(unicode);
3215 if (unicode_decode_call_errorhandler(
3216 errors,
3217 &errorHandler,
3218 "utf16", errmsg,
3219 &starts,
3220 (const char **)&e,
3221 &startinpos,
3222 &endinpos,
3223 &exc,
3224 (const char **)&q,
3225 &unicode,
3226 &outpos,
3227 &p))
3228 goto onError;
3229 /* The remaining input chars are ignored if the callback
3230 chooses to skip the input */
3231 }
3232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233
3234 if (byteorder)
3235 *byteorder = bo;
3236
Walter Dörwald69652032004-09-07 20:24:22 +00003237 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003239
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003241 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 goto onError;
3243
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003244 Py_XDECREF(errorHandler);
3245 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 return (PyObject *)unicode;
3247
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003250 Py_XDECREF(errorHandler);
3251 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252 return NULL;
3253}
3254
Antoine Pitrouab868312009-01-10 15:40:25 +00003255#undef FAST_CHAR_MASK
3256#undef SWAPPED_FAST_CHAR_MASK
3257
Tim Peters772747b2001-08-09 22:21:55 +00003258PyObject *
3259PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 Py_ssize_t size,
3261 const char *errors,
3262 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003264 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003265 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003266 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003267#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003268 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003269#else
3270 const int pairs = 0;
3271#endif
Tim Peters772747b2001-08-09 22:21:55 +00003272 /* Offsets from p for storing byte pairs in the right order. */
3273#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3274 int ihi = 1, ilo = 0;
3275#else
3276 int ihi = 0, ilo = 1;
3277#endif
3278
Benjamin Peterson29060642009-01-31 22:14:21 +00003279#define STORECHAR(CH) \
3280 do { \
3281 p[ihi] = ((CH) >> 8) & 0xff; \
3282 p[ilo] = (CH) & 0xff; \
3283 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003284 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003286#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003287 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003288 if (s[i] >= 0x10000)
3289 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003290#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003291 /* 2 * (size + pairs + (byteorder == 0)) */
3292 if (size > PY_SSIZE_T_MAX ||
3293 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003295 nsize = size + pairs + (byteorder == 0);
3296 bytesize = nsize * 2;
3297 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003299 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 if (v == NULL)
3301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003303 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003305 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003306 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003307 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003308
3309 if (byteorder == -1) {
3310 /* force LE */
3311 ihi = 1;
3312 ilo = 0;
3313 }
3314 else if (byteorder == 1) {
3315 /* force BE */
3316 ihi = 0;
3317 ilo = 1;
3318 }
3319
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003320 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003321 Py_UNICODE ch = *s++;
3322 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003323#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 if (ch >= 0x10000) {
3325 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3326 ch = 0xD800 | ((ch-0x10000) >> 10);
3327 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003328#endif
Tim Peters772747b2001-08-09 22:21:55 +00003329 STORECHAR(ch);
3330 if (ch2)
3331 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003332 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003333
3334 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003335 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003336#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337}
3338
3339PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3340{
3341 if (!PyUnicode_Check(unicode)) {
3342 PyErr_BadArgument();
3343 return NULL;
3344 }
3345 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003346 PyUnicode_GET_SIZE(unicode),
3347 NULL,
3348 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349}
3350
3351/* --- Unicode Escape Codec ----------------------------------------------- */
3352
Fredrik Lundh06d12682001-01-24 07:59:11 +00003353static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003354
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003356 Py_ssize_t size,
3357 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003360 Py_ssize_t startinpos;
3361 Py_ssize_t endinpos;
3362 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003363 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003367 char* message;
3368 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369 PyObject *errorHandler = NULL;
3370 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003371
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 /* Escaped strings will always be longer than the resulting
3373 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 length after conversion to the true value.
3375 (but if the error callback returns a long replacement string
3376 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 v = _PyUnicode_New(size);
3378 if (v == NULL)
3379 goto onError;
3380 if (size == 0)
3381 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003385
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386 while (s < end) {
3387 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003388 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003389 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390
3391 /* Non-escape characters are interpreted as Unicode ordinals */
3392 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003393 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 continue;
3395 }
3396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003397 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003398 /* \ - Escapes */
3399 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003400 c = *s++;
3401 if (s > end)
3402 c = '\0'; /* Invalid after \ */
3403 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404
Benjamin Peterson29060642009-01-31 22:14:21 +00003405 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406 case '\n': break;
3407 case '\\': *p++ = '\\'; break;
3408 case '\'': *p++ = '\''; break;
3409 case '\"': *p++ = '\"'; break;
3410 case 'b': *p++ = '\b'; break;
3411 case 'f': *p++ = '\014'; break; /* FF */
3412 case 't': *p++ = '\t'; break;
3413 case 'n': *p++ = '\n'; break;
3414 case 'r': *p++ = '\r'; break;
3415 case 'v': *p++ = '\013'; break; /* VT */
3416 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3417
Benjamin Peterson29060642009-01-31 22:14:21 +00003418 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 case '0': case '1': case '2': case '3':
3420 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003421 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003422 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003423 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003424 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003425 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003427 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 break;
3429
Benjamin Peterson29060642009-01-31 22:14:21 +00003430 /* hex escapes */
3431 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003433 digits = 2;
3434 message = "truncated \\xXX escape";
3435 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003436
Benjamin Peterson29060642009-01-31 22:14:21 +00003437 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003439 digits = 4;
3440 message = "truncated \\uXXXX escape";
3441 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003444 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003445 digits = 8;
3446 message = "truncated \\UXXXXXXXX escape";
3447 hexescape:
3448 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003449 outpos = p-PyUnicode_AS_UNICODE(v);
3450 if (s+digits>end) {
3451 endinpos = size;
3452 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003453 errors, &errorHandler,
3454 "unicodeescape", "end of string in escape sequence",
3455 &starts, &end, &startinpos, &endinpos, &exc, &s,
3456 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 goto onError;
3458 goto nextByte;
3459 }
3460 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003461 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003462 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 endinpos = (s+i+1)-starts;
3464 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003465 errors, &errorHandler,
3466 "unicodeescape", message,
3467 &starts, &end, &startinpos, &endinpos, &exc, &s,
3468 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003469 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003471 }
3472 chr = (chr<<4) & ~0xF;
3473 if (c >= '0' && c <= '9')
3474 chr += c - '0';
3475 else if (c >= 'a' && c <= 'f')
3476 chr += 10 + c - 'a';
3477 else
3478 chr += 10 + c - 'A';
3479 }
3480 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003481 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 /* _decoding_error will have already written into the
3483 target buffer. */
3484 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003485 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003486 /* when we get here, chr is a 32-bit unicode character */
3487 if (chr <= 0xffff)
3488 /* UCS-2 character */
3489 *p++ = (Py_UNICODE) chr;
3490 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003491 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003492 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003493#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003494 *p++ = chr;
3495#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003496 chr -= 0x10000L;
3497 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003498 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003499#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003500 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 endinpos = s-starts;
3502 outpos = p-PyUnicode_AS_UNICODE(v);
3503 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 errors, &errorHandler,
3505 "unicodeescape", "illegal Unicode character",
3506 &starts, &end, &startinpos, &endinpos, &exc, &s,
3507 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003508 goto onError;
3509 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003510 break;
3511
Benjamin Peterson29060642009-01-31 22:14:21 +00003512 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003513 case 'N':
3514 message = "malformed \\N character escape";
3515 if (ucnhash_CAPI == NULL) {
3516 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003517 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003518 if (ucnhash_CAPI == NULL)
3519 goto ucnhashError;
3520 }
3521 if (*s == '{') {
3522 const char *start = s+1;
3523 /* look for the closing brace */
3524 while (*s != '}' && s < end)
3525 s++;
3526 if (s > start && s < end && *s == '}') {
3527 /* found a name. look it up in the unicode database */
3528 message = "unknown Unicode character name";
3529 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003530 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003531 goto store;
3532 }
3533 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 endinpos = s-starts;
3535 outpos = p-PyUnicode_AS_UNICODE(v);
3536 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 errors, &errorHandler,
3538 "unicodeescape", message,
3539 &starts, &end, &startinpos, &endinpos, &exc, &s,
3540 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003541 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003542 break;
3543
3544 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003545 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 message = "\\ at end of string";
3547 s--;
3548 endinpos = s-starts;
3549 outpos = p-PyUnicode_AS_UNICODE(v);
3550 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003551 errors, &errorHandler,
3552 "unicodeescape", message,
3553 &starts, &end, &startinpos, &endinpos, &exc, &s,
3554 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003555 goto onError;
3556 }
3557 else {
3558 *p++ = '\\';
3559 *p++ = (unsigned char)s[-1];
3560 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003561 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003563 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003566 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003568 Py_XDECREF(errorHandler);
3569 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003571
Benjamin Peterson29060642009-01-31 22:14:21 +00003572 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003573 PyErr_SetString(
3574 PyExc_UnicodeError,
3575 "\\N escapes not supported (can't load unicodedata module)"
3576 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003577 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 Py_XDECREF(errorHandler);
3579 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003580 return NULL;
3581
Benjamin Peterson29060642009-01-31 22:14:21 +00003582 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 Py_XDECREF(errorHandler);
3585 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 return NULL;
3587}
3588
3589/* Return a Unicode-Escape string version of the Unicode object.
3590
3591 If quotes is true, the string is enclosed in u"" or u'' quotes as
3592 appropriate.
3593
3594*/
3595
Thomas Wouters477c8d52006-05-27 19:21:47 +00003596Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003597 Py_ssize_t size,
3598 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003599{
3600 /* like wcschr, but doesn't stop at NULL characters */
3601
3602 while (size-- > 0) {
3603 if (*s == ch)
3604 return s;
3605 s++;
3606 }
3607
3608 return NULL;
3609}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003610
Walter Dörwald79e913e2007-05-12 11:08:06 +00003611static const char *hexdigits = "0123456789abcdef";
3612
3613PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003614 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003616 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003619#ifdef Py_UNICODE_WIDE
3620 const Py_ssize_t expandsize = 10;
3621#else
3622 const Py_ssize_t expandsize = 6;
3623#endif
3624
Thomas Wouters89f507f2006-12-13 04:49:30 +00003625 /* XXX(nnorwitz): rather than over-allocating, it would be
3626 better to choose a different scheme. Perhaps scan the
3627 first N-chars of the string and allocate based on that size.
3628 */
3629 /* Initial allocation is based on the longest-possible unichr
3630 escape.
3631
3632 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3633 unichr, so in this case it's the longest unichr escape. In
3634 narrow (UTF-16) builds this is five chars per source unichr
3635 since there are two unichrs in the surrogate pair, so in narrow
3636 (UTF-16) builds it's not the longest unichr escape.
3637
3638 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3639 so in the narrow (UTF-16) build case it's the longest unichr
3640 escape.
3641 */
3642
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003643 if (size == 0)
3644 return PyBytes_FromStringAndSize(NULL, 0);
3645
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003646 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003647 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003648
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003649 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 2
3651 + expandsize*size
3652 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 if (repr == NULL)
3654 return NULL;
3655
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003656 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 while (size-- > 0) {
3659 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003660
Walter Dörwald79e913e2007-05-12 11:08:06 +00003661 /* Escape backslashes */
3662 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 *p++ = '\\';
3664 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003665 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003666 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003667
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003668#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003669 /* Map 21-bit characters to '\U00xxxxxx' */
3670 else if (ch >= 0x10000) {
3671 *p++ = '\\';
3672 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003673 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3674 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3675 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3676 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3677 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3678 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3679 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3680 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003682 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003683#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003684 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3685 else if (ch >= 0xD800 && ch < 0xDC00) {
3686 Py_UNICODE ch2;
3687 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003688
Benjamin Peterson29060642009-01-31 22:14:21 +00003689 ch2 = *s++;
3690 size--;
3691 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3692 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3693 *p++ = '\\';
3694 *p++ = 'U';
3695 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3696 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3697 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3698 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3699 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3700 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3701 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3702 *p++ = hexdigits[ucs & 0x0000000F];
3703 continue;
3704 }
3705 /* Fall through: isolated surrogates are copied as-is */
3706 s--;
3707 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003708 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003709#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003710
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003712 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 *p++ = '\\';
3714 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003715 *p++ = hexdigits[(ch >> 12) & 0x000F];
3716 *p++ = hexdigits[(ch >> 8) & 0x000F];
3717 *p++ = hexdigits[(ch >> 4) & 0x000F];
3718 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003720
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003721 /* Map special whitespace to '\t', \n', '\r' */
3722 else if (ch == '\t') {
3723 *p++ = '\\';
3724 *p++ = 't';
3725 }
3726 else if (ch == '\n') {
3727 *p++ = '\\';
3728 *p++ = 'n';
3729 }
3730 else if (ch == '\r') {
3731 *p++ = '\\';
3732 *p++ = 'r';
3733 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003734
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003735 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003736 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003738 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003739 *p++ = hexdigits[(ch >> 4) & 0x000F];
3740 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003741 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003742
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 /* Copy everything else as-is */
3744 else
3745 *p++ = (char) ch;
3746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003748 assert(p - PyBytes_AS_STRING(repr) > 0);
3749 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3750 return NULL;
3751 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752}
3753
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003754PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003756 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 if (!PyUnicode_Check(unicode)) {
3758 PyErr_BadArgument();
3759 return NULL;
3760 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003761 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3762 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003763 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764}
3765
3766/* --- Raw Unicode Escape Codec ------------------------------------------- */
3767
3768PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003769 Py_ssize_t size,
3770 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003773 Py_ssize_t startinpos;
3774 Py_ssize_t endinpos;
3775 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 const char *end;
3779 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003780 PyObject *errorHandler = NULL;
3781 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003782
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 /* Escaped strings will always be longer than the resulting
3784 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003785 length after conversion to the true value. (But decoding error
3786 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787 v = _PyUnicode_New(size);
3788 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003789 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003791 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 end = s + size;
3794 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003795 unsigned char c;
3796 Py_UCS4 x;
3797 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003798 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799
Benjamin Peterson29060642009-01-31 22:14:21 +00003800 /* Non-escape characters are interpreted as Unicode ordinals */
3801 if (*s != '\\') {
3802 *p++ = (unsigned char)*s++;
3803 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003804 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003805 startinpos = s-starts;
3806
3807 /* \u-escapes are only interpreted iff the number of leading
3808 backslashes if odd */
3809 bs = s;
3810 for (;s < end;) {
3811 if (*s != '\\')
3812 break;
3813 *p++ = (unsigned char)*s++;
3814 }
3815 if (((s - bs) & 1) == 0 ||
3816 s >= end ||
3817 (*s != 'u' && *s != 'U')) {
3818 continue;
3819 }
3820 p--;
3821 count = *s=='u' ? 4 : 8;
3822 s++;
3823
3824 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3825 outpos = p-PyUnicode_AS_UNICODE(v);
3826 for (x = 0, i = 0; i < count; ++i, ++s) {
3827 c = (unsigned char)*s;
3828 if (!ISXDIGIT(c)) {
3829 endinpos = s-starts;
3830 if (unicode_decode_call_errorhandler(
3831 errors, &errorHandler,
3832 "rawunicodeescape", "truncated \\uXXXX",
3833 &starts, &end, &startinpos, &endinpos, &exc, &s,
3834 &v, &outpos, &p))
3835 goto onError;
3836 goto nextByte;
3837 }
3838 x = (x<<4) & ~0xF;
3839 if (c >= '0' && c <= '9')
3840 x += c - '0';
3841 else if (c >= 'a' && c <= 'f')
3842 x += 10 + c - 'a';
3843 else
3844 x += 10 + c - 'A';
3845 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003846 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003847 /* UCS-2 character */
3848 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003849 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003850 /* UCS-4 character. Either store directly, or as
3851 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003852#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003854#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 x -= 0x10000L;
3856 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3857 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003858#endif
3859 } else {
3860 endinpos = s-starts;
3861 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003862 if (unicode_decode_call_errorhandler(
3863 errors, &errorHandler,
3864 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 &starts, &end, &startinpos, &endinpos, &exc, &s,
3866 &v, &outpos, &p))
3867 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003868 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003869 nextByte:
3870 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003872 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003873 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003874 Py_XDECREF(errorHandler);
3875 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003877
Benjamin Peterson29060642009-01-31 22:14:21 +00003878 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003880 Py_XDECREF(errorHandler);
3881 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 return NULL;
3883}
3884
3885PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003888 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889 char *p;
3890 char *q;
3891
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003892#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003893 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003894#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003895 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003896#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003897
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003898 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003900
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003901 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 if (repr == NULL)
3903 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003904 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003905 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003907 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 while (size-- > 0) {
3909 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003910#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003911 /* Map 32-bit characters to '\Uxxxxxxxx' */
3912 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003913 *p++ = '\\';
3914 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003915 *p++ = hexdigits[(ch >> 28) & 0xf];
3916 *p++ = hexdigits[(ch >> 24) & 0xf];
3917 *p++ = hexdigits[(ch >> 20) & 0xf];
3918 *p++ = hexdigits[(ch >> 16) & 0xf];
3919 *p++ = hexdigits[(ch >> 12) & 0xf];
3920 *p++ = hexdigits[(ch >> 8) & 0xf];
3921 *p++ = hexdigits[(ch >> 4) & 0xf];
3922 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003923 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003924 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003925#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003926 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3927 if (ch >= 0xD800 && ch < 0xDC00) {
3928 Py_UNICODE ch2;
3929 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003930
Benjamin Peterson29060642009-01-31 22:14:21 +00003931 ch2 = *s++;
3932 size--;
3933 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3934 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3935 *p++ = '\\';
3936 *p++ = 'U';
3937 *p++ = hexdigits[(ucs >> 28) & 0xf];
3938 *p++ = hexdigits[(ucs >> 24) & 0xf];
3939 *p++ = hexdigits[(ucs >> 20) & 0xf];
3940 *p++ = hexdigits[(ucs >> 16) & 0xf];
3941 *p++ = hexdigits[(ucs >> 12) & 0xf];
3942 *p++ = hexdigits[(ucs >> 8) & 0xf];
3943 *p++ = hexdigits[(ucs >> 4) & 0xf];
3944 *p++ = hexdigits[ucs & 0xf];
3945 continue;
3946 }
3947 /* Fall through: isolated surrogates are copied as-is */
3948 s--;
3949 size++;
3950 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003951#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003952 /* Map 16-bit characters to '\uxxxx' */
3953 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 *p++ = '\\';
3955 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003956 *p++ = hexdigits[(ch >> 12) & 0xf];
3957 *p++ = hexdigits[(ch >> 8) & 0xf];
3958 *p++ = hexdigits[(ch >> 4) & 0xf];
3959 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003961 /* Copy everything else as-is */
3962 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 *p++ = (char) ch;
3964 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003965 size = p - q;
3966
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003967 assert(size > 0);
3968 if (_PyBytes_Resize(&repr, size) < 0)
3969 return NULL;
3970 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971}
3972
3973PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3974{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003975 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003977 PyErr_BadArgument();
3978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003980 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3981 PyUnicode_GET_SIZE(unicode));
3982
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003983 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984}
3985
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003986/* --- Unicode Internal Codec ------------------------------------------- */
3987
3988PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003989 Py_ssize_t size,
3990 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003991{
3992 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003993 Py_ssize_t startinpos;
3994 Py_ssize_t endinpos;
3995 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003996 PyUnicodeObject *v;
3997 Py_UNICODE *p;
3998 const char *end;
3999 const char *reason;
4000 PyObject *errorHandler = NULL;
4001 PyObject *exc = NULL;
4002
Neal Norwitzd43069c2006-01-08 01:12:10 +00004003#ifdef Py_UNICODE_WIDE
4004 Py_UNICODE unimax = PyUnicode_GetMax();
4005#endif
4006
Thomas Wouters89f507f2006-12-13 04:49:30 +00004007 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004008 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4009 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004010 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004011 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004012 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004013 p = PyUnicode_AS_UNICODE(v);
4014 end = s + size;
4015
4016 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004017 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004018 /* We have to sanity check the raw data, otherwise doom looms for
4019 some malformed UCS-4 data. */
4020 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004021#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004022 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004023#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004024 end-s < Py_UNICODE_SIZE
4025 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004026 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004027 startinpos = s - starts;
4028 if (end-s < Py_UNICODE_SIZE) {
4029 endinpos = end-starts;
4030 reason = "truncated input";
4031 }
4032 else {
4033 endinpos = s - starts + Py_UNICODE_SIZE;
4034 reason = "illegal code point (> 0x10FFFF)";
4035 }
4036 outpos = p - PyUnicode_AS_UNICODE(v);
4037 if (unicode_decode_call_errorhandler(
4038 errors, &errorHandler,
4039 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004040 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004041 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004042 goto onError;
4043 }
4044 }
4045 else {
4046 p++;
4047 s += Py_UNICODE_SIZE;
4048 }
4049 }
4050
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004051 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004052 goto onError;
4053 Py_XDECREF(errorHandler);
4054 Py_XDECREF(exc);
4055 return (PyObject *)v;
4056
Benjamin Peterson29060642009-01-31 22:14:21 +00004057 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004058 Py_XDECREF(v);
4059 Py_XDECREF(errorHandler);
4060 Py_XDECREF(exc);
4061 return NULL;
4062}
4063
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064/* --- Latin-1 Codec ------------------------------------------------------ */
4065
4066PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 Py_ssize_t size,
4068 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069{
4070 PyUnicodeObject *v;
4071 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004072 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004073
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004075 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004076 Py_UNICODE r = *(unsigned char*)s;
4077 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004078 }
4079
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 v = _PyUnicode_New(size);
4081 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004086 e = s + size;
4087 /* Unrolling the copy makes it much faster by reducing the looping
4088 overhead. This is similar to what many memcpy() implementations do. */
4089 unrolled_end = e - 4;
4090 while (s < unrolled_end) {
4091 p[0] = (unsigned char) s[0];
4092 p[1] = (unsigned char) s[1];
4093 p[2] = (unsigned char) s[2];
4094 p[3] = (unsigned char) s[3];
4095 s += 4;
4096 p += 4;
4097 }
4098 while (s < e)
4099 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004101
Benjamin Peterson29060642009-01-31 22:14:21 +00004102 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 Py_XDECREF(v);
4104 return NULL;
4105}
4106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107/* create or adjust a UnicodeEncodeError */
4108static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 const char *encoding,
4110 const Py_UNICODE *unicode, Py_ssize_t size,
4111 Py_ssize_t startpos, Py_ssize_t endpos,
4112 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 *exceptionObject = PyUnicodeEncodeError_Create(
4116 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 }
4118 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004119 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4120 goto onError;
4121 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4122 goto onError;
4123 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4124 goto onError;
4125 return;
4126 onError:
4127 Py_DECREF(*exceptionObject);
4128 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 }
4130}
4131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132/* raises a UnicodeEncodeError */
4133static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 const char *encoding,
4135 const Py_UNICODE *unicode, Py_ssize_t size,
4136 Py_ssize_t startpos, Py_ssize_t endpos,
4137 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138{
4139 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143}
4144
4145/* error handling callback helper:
4146 build arguments, call the callback and check the arguments,
4147 put the result into newpos and return the replacement string, which
4148 has to be freed by the caller */
4149static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004150 PyObject **errorHandler,
4151 const char *encoding, const char *reason,
4152 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4153 Py_ssize_t startpos, Py_ssize_t endpos,
4154 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004156 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157
4158 PyObject *restuple;
4159 PyObject *resunicode;
4160
4161 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 }
4166
4167 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004168 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004169 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004170 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171
4172 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004177 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 Py_DECREF(restuple);
4179 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004181 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 &resunicode, newpos)) {
4183 Py_DECREF(restuple);
4184 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004186 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4187 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4188 Py_DECREF(restuple);
4189 return NULL;
4190 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004193 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4195 Py_DECREF(restuple);
4196 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004197 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 Py_INCREF(resunicode);
4199 Py_DECREF(restuple);
4200 return resunicode;
4201}
4202
4203static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 Py_ssize_t size,
4205 const char *errors,
4206 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207{
4208 /* output object */
4209 PyObject *res;
4210 /* pointers to the beginning and end+1 of input */
4211 const Py_UNICODE *startp = p;
4212 const Py_UNICODE *endp = p + size;
4213 /* pointer to the beginning of the unencodable characters */
4214 /* const Py_UNICODE *badp = NULL; */
4215 /* pointer into the output */
4216 char *str;
4217 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004218 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004219 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4220 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 PyObject *errorHandler = NULL;
4222 PyObject *exc = NULL;
4223 /* the following variable is used for caching string comparisons
4224 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4225 int known_errorHandler = -1;
4226
4227 /* allocate enough for a simple encoding without
4228 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004229 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004230 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004231 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004233 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004234 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 ressize = size;
4236
4237 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 /* can we encode this? */
4241 if (c<limit) {
4242 /* no overflow check, because we know that the space is enough */
4243 *str++ = (char)c;
4244 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004245 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 else {
4247 Py_ssize_t unicodepos = p-startp;
4248 Py_ssize_t requiredsize;
4249 PyObject *repunicode;
4250 Py_ssize_t repsize;
4251 Py_ssize_t newpos;
4252 Py_ssize_t respos;
4253 Py_UNICODE *uni2;
4254 /* startpos for collecting unencodable chars */
4255 const Py_UNICODE *collstart = p;
4256 const Py_UNICODE *collend = p;
4257 /* find all unecodable characters */
4258 while ((collend < endp) && ((*collend)>=limit))
4259 ++collend;
4260 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4261 if (known_errorHandler==-1) {
4262 if ((errors==NULL) || (!strcmp(errors, "strict")))
4263 known_errorHandler = 1;
4264 else if (!strcmp(errors, "replace"))
4265 known_errorHandler = 2;
4266 else if (!strcmp(errors, "ignore"))
4267 known_errorHandler = 3;
4268 else if (!strcmp(errors, "xmlcharrefreplace"))
4269 known_errorHandler = 4;
4270 else
4271 known_errorHandler = 0;
4272 }
4273 switch (known_errorHandler) {
4274 case 1: /* strict */
4275 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4276 goto onError;
4277 case 2: /* replace */
4278 while (collstart++<collend)
4279 *str++ = '?'; /* fall through */
4280 case 3: /* ignore */
4281 p = collend;
4282 break;
4283 case 4: /* xmlcharrefreplace */
4284 respos = str - PyBytes_AS_STRING(res);
4285 /* determine replacement size (temporarily (mis)uses p) */
4286 for (p = collstart, repsize = 0; p < collend; ++p) {
4287 if (*p<10)
4288 repsize += 2+1+1;
4289 else if (*p<100)
4290 repsize += 2+2+1;
4291 else if (*p<1000)
4292 repsize += 2+3+1;
4293 else if (*p<10000)
4294 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004295#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 else
4297 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004298#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 else if (*p<100000)
4300 repsize += 2+5+1;
4301 else if (*p<1000000)
4302 repsize += 2+6+1;
4303 else
4304 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004305#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004306 }
4307 requiredsize = respos+repsize+(endp-collend);
4308 if (requiredsize > ressize) {
4309 if (requiredsize<2*ressize)
4310 requiredsize = 2*ressize;
4311 if (_PyBytes_Resize(&res, requiredsize))
4312 goto onError;
4313 str = PyBytes_AS_STRING(res) + respos;
4314 ressize = requiredsize;
4315 }
4316 /* generate replacement (temporarily (mis)uses p) */
4317 for (p = collstart; p < collend; ++p) {
4318 str += sprintf(str, "&#%d;", (int)*p);
4319 }
4320 p = collend;
4321 break;
4322 default:
4323 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4324 encoding, reason, startp, size, &exc,
4325 collstart-startp, collend-startp, &newpos);
4326 if (repunicode == NULL)
4327 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004328 if (PyBytes_Check(repunicode)) {
4329 /* Directly copy bytes result to output. */
4330 repsize = PyBytes_Size(repunicode);
4331 if (repsize > 1) {
4332 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004333 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004334 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4335 Py_DECREF(repunicode);
4336 goto onError;
4337 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004338 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004339 ressize += repsize-1;
4340 }
4341 memcpy(str, PyBytes_AsString(repunicode), repsize);
4342 str += repsize;
4343 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004344 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004345 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004346 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 /* need more space? (at least enough for what we
4348 have+the replacement+the rest of the string, so
4349 we won't have to check space for encodable characters) */
4350 respos = str - PyBytes_AS_STRING(res);
4351 repsize = PyUnicode_GET_SIZE(repunicode);
4352 requiredsize = respos+repsize+(endp-collend);
4353 if (requiredsize > ressize) {
4354 if (requiredsize<2*ressize)
4355 requiredsize = 2*ressize;
4356 if (_PyBytes_Resize(&res, requiredsize)) {
4357 Py_DECREF(repunicode);
4358 goto onError;
4359 }
4360 str = PyBytes_AS_STRING(res) + respos;
4361 ressize = requiredsize;
4362 }
4363 /* check if there is anything unencodable in the replacement
4364 and copy it to the output */
4365 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4366 c = *uni2;
4367 if (c >= limit) {
4368 raise_encode_exception(&exc, encoding, startp, size,
4369 unicodepos, unicodepos+1, reason);
4370 Py_DECREF(repunicode);
4371 goto onError;
4372 }
4373 *str = (char)c;
4374 }
4375 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004376 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004377 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004378 }
4379 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004380 /* Resize if we allocated to much */
4381 size = str - PyBytes_AS_STRING(res);
4382 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004383 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004384 if (_PyBytes_Resize(&res, size) < 0)
4385 goto onError;
4386 }
4387
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 Py_XDECREF(errorHandler);
4389 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004390 return res;
4391
4392 onError:
4393 Py_XDECREF(res);
4394 Py_XDECREF(errorHandler);
4395 Py_XDECREF(exc);
4396 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397}
4398
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 Py_ssize_t size,
4401 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404}
4405
4406PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4407{
4408 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 PyErr_BadArgument();
4410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 }
4412 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 PyUnicode_GET_SIZE(unicode),
4414 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415}
4416
4417/* --- 7-bit ASCII Codec -------------------------------------------------- */
4418
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 Py_ssize_t size,
4421 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 PyUnicodeObject *v;
4425 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004426 Py_ssize_t startinpos;
4427 Py_ssize_t endinpos;
4428 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 const char *e;
4430 PyObject *errorHandler = NULL;
4431 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004432
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004434 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 Py_UNICODE r = *(unsigned char*)s;
4436 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004437 }
Tim Petersced69f82003-09-16 20:30:58 +00004438
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 v = _PyUnicode_New(size);
4440 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004441 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 e = s + size;
4446 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 register unsigned char c = (unsigned char)*s;
4448 if (c < 128) {
4449 *p++ = c;
4450 ++s;
4451 }
4452 else {
4453 startinpos = s-starts;
4454 endinpos = startinpos + 1;
4455 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4456 if (unicode_decode_call_errorhandler(
4457 errors, &errorHandler,
4458 "ascii", "ordinal not in range(128)",
4459 &starts, &e, &startinpos, &endinpos, &exc, &s,
4460 &v, &outpos, &p))
4461 goto onError;
4462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004464 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4466 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 Py_XDECREF(errorHandler);
4468 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004470
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 Py_XDECREF(errorHandler);
4474 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 return NULL;
4476}
4477
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 Py_ssize_t size,
4480 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483}
4484
4485PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4486{
4487 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 PyErr_BadArgument();
4489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 }
4491 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 PyUnicode_GET_SIZE(unicode),
4493 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494}
4495
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004496#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004497
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004498/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004499
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004500#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004501#define NEED_RETRY
4502#endif
4503
4504/* XXX This code is limited to "true" double-byte encodings, as
4505 a) it assumes an incomplete character consists of a single byte, and
4506 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004508
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.  Returns 0 when IsDBCSLeadByte() rejects it.  Otherwise the
   position CharPrev() reports for the preceding character is examined and
   1 is returned when that position is the byte itself, when the byte there
   is not a lead byte, or when it lies exactly two bytes back; 0 in the
   remaining case. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
4519
4520/*
4521 * Decode MBCS string into unicode object. If 'final' is set, converts
4522 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4523 */
4524static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 const char *s, /* MBCS string */
4526 int size, /* sizeof MBCS string */
4527 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004528{
4529 Py_UNICODE *p;
4530 Py_ssize_t n = 0;
4531 int usize = 0;
4532
4533 assert(size >= 0);
4534
4535 /* Skip trailing lead-byte unless 'final' is set */
4536 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004537 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004538
4539 /* First get the size of the result */
4540 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4542 if (usize == 0) {
4543 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4544 return -1;
4545 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004546 }
4547
4548 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 /* Create unicode object */
4550 *v = _PyUnicode_New(usize);
4551 if (*v == NULL)
4552 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004553 }
4554 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 /* Extend unicode object */
4556 n = PyUnicode_GET_SIZE(*v);
4557 if (_PyUnicode_Resize(v, n + usize) < 0)
4558 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004559 }
4560
4561 /* Do the conversion */
4562 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 p = PyUnicode_AS_UNICODE(*v) + n;
4564 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4565 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4566 return -1;
4567 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004568 }
4569
4570 return size;
4571}
4572
4573PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 Py_ssize_t size,
4575 const char *errors,
4576 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004577{
4578 PyUnicodeObject *v = NULL;
4579 int done;
4580
4581 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004583
4584#ifdef NEED_RETRY
4585 retry:
4586 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004588 else
4589#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004591
4592 if (done < 0) {
4593 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004595 }
4596
4597 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004599
4600#ifdef NEED_RETRY
4601 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 s += done;
4603 size -= done;
4604 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004605 }
4606#endif
4607
4608 return (PyObject *)v;
4609}
4610
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004611PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 Py_ssize_t size,
4613 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004614{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004615 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4616}
4617
4618/*
4619 * Convert unicode into string object (MBCS).
4620 * Returns 0 if succeed, -1 otherwise.
4621 */
4622static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 const Py_UNICODE *p, /* unicode */
4624 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004625{
4626 int mbcssize = 0;
4627 Py_ssize_t n = 0;
4628
4629 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004630
4631 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004632 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004633 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4634 if (mbcssize == 0) {
4635 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4636 return -1;
4637 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004638 }
4639
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004640 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 /* Create string object */
4642 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4643 if (*repr == NULL)
4644 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004645 }
4646 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004647 /* Extend string object */
4648 n = PyBytes_Size(*repr);
4649 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4650 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004651 }
4652
4653 /* Do the conversion */
4654 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 char *s = PyBytes_AS_STRING(*repr) + n;
4656 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4657 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4658 return -1;
4659 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004660 }
4661
4662 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004663}
4664
4665PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 Py_ssize_t size,
4667 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004668{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004669 PyObject *repr = NULL;
4670 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004671
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004672#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004674 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004676 else
4677#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004679
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004680 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 Py_XDECREF(repr);
4682 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004683 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004684
4685#ifdef NEED_RETRY
4686 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004687 p += INT_MAX;
4688 size -= INT_MAX;
4689 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004690 }
4691#endif
4692
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004693 return repr;
4694}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004695
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004696PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4697{
4698 if (!PyUnicode_Check(unicode)) {
4699 PyErr_BadArgument();
4700 return NULL;
4701 }
4702 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 PyUnicode_GET_SIZE(unicode),
4704 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004705}
4706
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004707#undef NEED_RETRY
4708
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004709#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004710
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711/* --- Character Mapping Codec -------------------------------------------- */
4712
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004714 Py_ssize_t size,
4715 PyObject *mapping,
4716 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004719 Py_ssize_t startinpos;
4720 Py_ssize_t endinpos;
4721 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723 PyUnicodeObject *v;
4724 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004725 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726 PyObject *errorHandler = NULL;
4727 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004728 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004729 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004730
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 /* Default to Latin-1 */
4732 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004733 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734
4735 v = _PyUnicode_New(size);
4736 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004737 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004742 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 mapstring = PyUnicode_AS_UNICODE(mapping);
4744 maplen = PyUnicode_GET_SIZE(mapping);
4745 while (s < e) {
4746 unsigned char ch = *s;
4747 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748
Benjamin Peterson29060642009-01-31 22:14:21 +00004749 if (ch < maplen)
4750 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 if (x == 0xfffe) {
4753 /* undefined mapping */
4754 outpos = p-PyUnicode_AS_UNICODE(v);
4755 startinpos = s-starts;
4756 endinpos = startinpos+1;
4757 if (unicode_decode_call_errorhandler(
4758 errors, &errorHandler,
4759 "charmap", "character maps to <undefined>",
4760 &starts, &e, &startinpos, &endinpos, &exc, &s,
4761 &v, &outpos, &p)) {
4762 goto onError;
4763 }
4764 continue;
4765 }
4766 *p++ = x;
4767 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004768 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004769 }
4770 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 while (s < e) {
4772 unsigned char ch = *s;
4773 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004774
Benjamin Peterson29060642009-01-31 22:14:21 +00004775 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4776 w = PyLong_FromLong((long)ch);
4777 if (w == NULL)
4778 goto onError;
4779 x = PyObject_GetItem(mapping, w);
4780 Py_DECREF(w);
4781 if (x == NULL) {
4782 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4783 /* No mapping found means: mapping is undefined. */
4784 PyErr_Clear();
4785 x = Py_None;
4786 Py_INCREF(x);
4787 } else
4788 goto onError;
4789 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004790
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 /* Apply mapping */
4792 if (PyLong_Check(x)) {
4793 long value = PyLong_AS_LONG(x);
4794 if (value < 0 || value > 65535) {
4795 PyErr_SetString(PyExc_TypeError,
4796 "character mapping must be in range(65536)");
4797 Py_DECREF(x);
4798 goto onError;
4799 }
4800 *p++ = (Py_UNICODE)value;
4801 }
4802 else if (x == Py_None) {
4803 /* undefined mapping */
4804 outpos = p-PyUnicode_AS_UNICODE(v);
4805 startinpos = s-starts;
4806 endinpos = startinpos+1;
4807 if (unicode_decode_call_errorhandler(
4808 errors, &errorHandler,
4809 "charmap", "character maps to <undefined>",
4810 &starts, &e, &startinpos, &endinpos, &exc, &s,
4811 &v, &outpos, &p)) {
4812 Py_DECREF(x);
4813 goto onError;
4814 }
4815 Py_DECREF(x);
4816 continue;
4817 }
4818 else if (PyUnicode_Check(x)) {
4819 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004820
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 if (targetsize == 1)
4822 /* 1-1 mapping */
4823 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004824
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 else if (targetsize > 1) {
4826 /* 1-n mapping */
4827 if (targetsize > extrachars) {
4828 /* resize first */
4829 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4830 Py_ssize_t needed = (targetsize - extrachars) + \
4831 (targetsize << 2);
4832 extrachars += needed;
4833 /* XXX overflow detection missing */
4834 if (_PyUnicode_Resize(&v,
4835 PyUnicode_GET_SIZE(v) + needed) < 0) {
4836 Py_DECREF(x);
4837 goto onError;
4838 }
4839 p = PyUnicode_AS_UNICODE(v) + oldpos;
4840 }
4841 Py_UNICODE_COPY(p,
4842 PyUnicode_AS_UNICODE(x),
4843 targetsize);
4844 p += targetsize;
4845 extrachars -= targetsize;
4846 }
4847 /* 1-0 mapping: skip the character */
4848 }
4849 else {
4850 /* wrong return value */
4851 PyErr_SetString(PyExc_TypeError,
4852 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004853 Py_DECREF(x);
4854 goto onError;
4855 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004856 Py_DECREF(x);
4857 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 }
4860 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4862 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 Py_XDECREF(errorHandler);
4864 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004866
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 Py_XDECREF(errorHandler);
4869 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 Py_XDECREF(v);
4871 return NULL;
4872}
4873
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004874/* Charmap encoding: the lookup table */
4875
4876struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 PyObject_HEAD
4878 unsigned char level1[32];
4879 int count2, count3;
4880 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004881};
4882
4883static PyObject*
4884encoding_map_size(PyObject *obj, PyObject* args)
4885{
4886 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004887 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004889}
4890
4891static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004892 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 PyDoc_STR("Return the size (in bytes) of this object") },
4894 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004895};
4896
4897static void
4898encoding_map_dealloc(PyObject* o)
4899{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004900 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004901}
4902
4903static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004904 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004905 "EncodingMap", /*tp_name*/
4906 sizeof(struct encoding_map), /*tp_basicsize*/
4907 0, /*tp_itemsize*/
4908 /* methods */
4909 encoding_map_dealloc, /*tp_dealloc*/
4910 0, /*tp_print*/
4911 0, /*tp_getattr*/
4912 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004913 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 0, /*tp_repr*/
4915 0, /*tp_as_number*/
4916 0, /*tp_as_sequence*/
4917 0, /*tp_as_mapping*/
4918 0, /*tp_hash*/
4919 0, /*tp_call*/
4920 0, /*tp_str*/
4921 0, /*tp_getattro*/
4922 0, /*tp_setattro*/
4923 0, /*tp_as_buffer*/
4924 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4925 0, /*tp_doc*/
4926 0, /*tp_traverse*/
4927 0, /*tp_clear*/
4928 0, /*tp_richcompare*/
4929 0, /*tp_weaklistoffset*/
4930 0, /*tp_iter*/
4931 0, /*tp_iternext*/
4932 encoding_map_methods, /*tp_methods*/
4933 0, /*tp_members*/
4934 0, /*tp_getset*/
4935 0, /*tp_base*/
4936 0, /*tp_dict*/
4937 0, /*tp_descr_get*/
4938 0, /*tp_descr_set*/
4939 0, /*tp_dictoffset*/
4940 0, /*tp_init*/
4941 0, /*tp_alloc*/
4942 0, /*tp_new*/
4943 0, /*tp_free*/
4944 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004945};
4946
4947PyObject*
4948PyUnicode_BuildEncodingMap(PyObject* string)
4949{
4950 Py_UNICODE *decode;
4951 PyObject *result;
4952 struct encoding_map *mresult;
4953 int i;
4954 int need_dict = 0;
4955 unsigned char level1[32];
4956 unsigned char level2[512];
4957 unsigned char *mlevel1, *mlevel2, *mlevel3;
4958 int count2 = 0, count3 = 0;
4959
4960 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4961 PyErr_BadArgument();
4962 return NULL;
4963 }
4964 decode = PyUnicode_AS_UNICODE(string);
4965 memset(level1, 0xFF, sizeof level1);
4966 memset(level2, 0xFF, sizeof level2);
4967
4968 /* If there isn't a one-to-one mapping of NULL to \0,
4969 or if there are non-BMP characters, we need to use
4970 a mapping dictionary. */
4971 if (decode[0] != 0)
4972 need_dict = 1;
4973 for (i = 1; i < 256; i++) {
4974 int l1, l2;
4975 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004976#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004977 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004978#endif
4979 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004980 need_dict = 1;
4981 break;
4982 }
4983 if (decode[i] == 0xFFFE)
4984 /* unmapped character */
4985 continue;
4986 l1 = decode[i] >> 11;
4987 l2 = decode[i] >> 7;
4988 if (level1[l1] == 0xFF)
4989 level1[l1] = count2++;
4990 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004991 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004992 }
4993
4994 if (count2 >= 0xFF || count3 >= 0xFF)
4995 need_dict = 1;
4996
4997 if (need_dict) {
4998 PyObject *result = PyDict_New();
4999 PyObject *key, *value;
5000 if (!result)
5001 return NULL;
5002 for (i = 0; i < 256; i++) {
5003 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005004 key = PyLong_FromLong(decode[i]);
5005 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005006 if (!key || !value)
5007 goto failed1;
5008 if (PyDict_SetItem(result, key, value) == -1)
5009 goto failed1;
5010 Py_DECREF(key);
5011 Py_DECREF(value);
5012 }
5013 return result;
5014 failed1:
5015 Py_XDECREF(key);
5016 Py_XDECREF(value);
5017 Py_DECREF(result);
5018 return NULL;
5019 }
5020
5021 /* Create a three-level trie */
5022 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5023 16*count2 + 128*count3 - 1);
5024 if (!result)
5025 return PyErr_NoMemory();
5026 PyObject_Init(result, &EncodingMapType);
5027 mresult = (struct encoding_map*)result;
5028 mresult->count2 = count2;
5029 mresult->count3 = count3;
5030 mlevel1 = mresult->level1;
5031 mlevel2 = mresult->level23;
5032 mlevel3 = mresult->level23 + 16*count2;
5033 memcpy(mlevel1, level1, 32);
5034 memset(mlevel2, 0xFF, 16*count2);
5035 memset(mlevel3, 0, 128*count3);
5036 count3 = 0;
5037 for (i = 1; i < 256; i++) {
5038 int o1, o2, o3, i2, i3;
5039 if (decode[i] == 0xFFFE)
5040 /* unmapped character */
5041 continue;
5042 o1 = decode[i]>>11;
5043 o2 = (decode[i]>>7) & 0xF;
5044 i2 = 16*mlevel1[o1] + o2;
5045 if (mlevel2[i2] == 0xFF)
5046 mlevel2[i2] = count3++;
5047 o3 = decode[i] & 0x7F;
5048 i3 = 128*mlevel2[i2] + o3;
5049 mlevel3[i3] = i;
5050 }
5051 return result;
5052}
5053
5054static int
5055encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5056{
5057 struct encoding_map *map = (struct encoding_map*)mapping;
5058 int l1 = c>>11;
5059 int l2 = (c>>7) & 0xF;
5060 int l3 = c & 0x7F;
5061 int i;
5062
5063#ifdef Py_UNICODE_WIDE
5064 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005066 }
5067#endif
5068 if (c == 0)
5069 return 0;
5070 /* level 1*/
5071 i = map->level1[l1];
5072 if (i == 0xFF) {
5073 return -1;
5074 }
5075 /* level 2*/
5076 i = map->level23[16*i+l2];
5077 if (i == 0xFF) {
5078 return -1;
5079 }
5080 /* level 3 */
5081 i = map->level23[16*map->count2 + 128*i + l3];
5082 if (i == 0) {
5083 return -1;
5084 }
5085 return i;
5086}
5087
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005088/* Lookup the character ch in the mapping. If the character
5089 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005090 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092{
Christian Heimes217cfd12007-12-02 14:31:20 +00005093 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005094 PyObject *x;
5095
5096 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005098 x = PyObject_GetItem(mapping, w);
5099 Py_DECREF(w);
5100 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5102 /* No mapping found means: mapping is undefined. */
5103 PyErr_Clear();
5104 x = Py_None;
5105 Py_INCREF(x);
5106 return x;
5107 } else
5108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005110 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005112 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005113 long value = PyLong_AS_LONG(x);
5114 if (value < 0 || value > 255) {
5115 PyErr_SetString(PyExc_TypeError,
5116 "character mapping must be in range(256)");
5117 Py_DECREF(x);
5118 return NULL;
5119 }
5120 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005122 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 /* wrong return value */
5126 PyErr_Format(PyExc_TypeError,
5127 "character mapping must return integer, bytes or None, not %.400s",
5128 x->ob_type->tp_name);
5129 Py_DECREF(x);
5130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 }
5132}
5133
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005134static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005135charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005136{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005137 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5138 /* exponentially overallocate to minimize reallocations */
5139 if (requiredsize < 2*outsize)
5140 requiredsize = 2*outsize;
5141 if (_PyBytes_Resize(outobj, requiredsize))
5142 return -1;
5143 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005144}
5145
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005150 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005151 space is available. Return a new reference to the object that
5152 was put in the output buffer, or Py_None, if the mapping was undefined
5153 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005154 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005155static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005156charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005158{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005159 PyObject *rep;
5160 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005161 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162
Christian Heimes90aa7642007-12-19 02:45:37 +00005163 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005164 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005166 if (res == -1)
5167 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 if (outsize<requiredsize)
5169 if (charmapencode_resize(outobj, outpos, requiredsize))
5170 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005171 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 outstart[(*outpos)++] = (char)res;
5173 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005174 }
5175
5176 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005179 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 Py_DECREF(rep);
5181 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005182 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 if (PyLong_Check(rep)) {
5184 Py_ssize_t requiredsize = *outpos+1;
5185 if (outsize<requiredsize)
5186 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5187 Py_DECREF(rep);
5188 return enc_EXCEPTION;
5189 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005190 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005192 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 else {
5194 const char *repchars = PyBytes_AS_STRING(rep);
5195 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5196 Py_ssize_t requiredsize = *outpos+repsize;
5197 if (outsize<requiredsize)
5198 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5199 Py_DECREF(rep);
5200 return enc_EXCEPTION;
5201 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005202 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 memcpy(outstart + *outpos, repchars, repsize);
5204 *outpos += repsize;
5205 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005206 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005207 Py_DECREF(rep);
5208 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209}
5210
5211/* handle an error in PyUnicode_EncodeCharmap
5212 Return 0 on success, -1 on error */
5213static
5214int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005215 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005217 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005218 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005219{
5220 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005221 Py_ssize_t repsize;
5222 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005223 Py_UNICODE *uni2;
5224 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005225 Py_ssize_t collstartpos = *inpos;
5226 Py_ssize_t collendpos = *inpos+1;
5227 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005228 char *encoding = "charmap";
5229 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005230 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005232 /* find all unencodable characters */
5233 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005234 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005235 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 int res = encoding_map_lookup(p[collendpos], mapping);
5237 if (res != -1)
5238 break;
5239 ++collendpos;
5240 continue;
5241 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005242
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 rep = charmapencode_lookup(p[collendpos], mapping);
5244 if (rep==NULL)
5245 return -1;
5246 else if (rep!=Py_None) {
5247 Py_DECREF(rep);
5248 break;
5249 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005250 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252 }
5253 /* cache callback name lookup
5254 * (if not done yet, i.e. it's the first error) */
5255 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 if ((errors==NULL) || (!strcmp(errors, "strict")))
5257 *known_errorHandler = 1;
5258 else if (!strcmp(errors, "replace"))
5259 *known_errorHandler = 2;
5260 else if (!strcmp(errors, "ignore"))
5261 *known_errorHandler = 3;
5262 else if (!strcmp(errors, "xmlcharrefreplace"))
5263 *known_errorHandler = 4;
5264 else
5265 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005266 }
5267 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005268 case 1: /* strict */
5269 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5270 return -1;
5271 case 2: /* replace */
5272 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 x = charmapencode_output('?', mapping, res, respos);
5274 if (x==enc_EXCEPTION) {
5275 return -1;
5276 }
5277 else if (x==enc_FAILED) {
5278 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5279 return -1;
5280 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005281 }
5282 /* fall through */
5283 case 3: /* ignore */
5284 *inpos = collendpos;
5285 break;
5286 case 4: /* xmlcharrefreplace */
5287 /* generate replacement (temporarily (mis)uses p) */
5288 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 char buffer[2+29+1+1];
5290 char *cp;
5291 sprintf(buffer, "&#%d;", (int)p[collpos]);
5292 for (cp = buffer; *cp; ++cp) {
5293 x = charmapencode_output(*cp, mapping, res, respos);
5294 if (x==enc_EXCEPTION)
5295 return -1;
5296 else if (x==enc_FAILED) {
5297 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5298 return -1;
5299 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005300 }
5301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005302 *inpos = collendpos;
5303 break;
5304 default:
5305 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 encoding, reason, p, size, exceptionObject,
5307 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005308 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005310 if (PyBytes_Check(repunicode)) {
5311 /* Directly copy bytes result to output. */
5312 Py_ssize_t outsize = PyBytes_Size(*res);
5313 Py_ssize_t requiredsize;
5314 repsize = PyBytes_Size(repunicode);
5315 requiredsize = *respos + repsize;
5316 if (requiredsize > outsize)
5317 /* Make room for all additional bytes. */
5318 if (charmapencode_resize(res, respos, requiredsize)) {
5319 Py_DECREF(repunicode);
5320 return -1;
5321 }
5322 memcpy(PyBytes_AsString(*res) + *respos,
5323 PyBytes_AsString(repunicode), repsize);
5324 *respos += repsize;
5325 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005326 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005327 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005328 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005329 /* generate replacement */
5330 repsize = PyUnicode_GET_SIZE(repunicode);
5331 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 x = charmapencode_output(*uni2, mapping, res, respos);
5333 if (x==enc_EXCEPTION) {
5334 return -1;
5335 }
5336 else if (x==enc_FAILED) {
5337 Py_DECREF(repunicode);
5338 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5339 return -1;
5340 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005341 }
5342 *inpos = newpos;
5343 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344 }
5345 return 0;
5346}
5347
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 Py_ssize_t size,
5350 PyObject *mapping,
5351 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 /* output object */
5354 PyObject *res = NULL;
5355 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005356 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005357 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005358 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359 PyObject *errorHandler = NULL;
5360 PyObject *exc = NULL;
5361 /* the following variable is used for caching string comparisons
5362 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5363 * 3=ignore, 4=xmlcharrefreplace */
5364 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365
5366 /* Default to Latin-1 */
5367 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 /* allocate enough for a simple encoding without
5371 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005372 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005373 if (res == NULL)
5374 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005375 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 /* try to encode it */
5380 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5381 if (x==enc_EXCEPTION) /* error */
5382 goto onError;
5383 if (x==enc_FAILED) { /* unencodable character */
5384 if (charmap_encoding_error(p, size, &inpos, mapping,
5385 &exc,
5386 &known_errorHandler, &errorHandler, errors,
5387 &res, &respos)) {
5388 goto onError;
5389 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005390 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 else
5392 /* done with this character => adjust input position */
5393 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005396 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005397 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005398 if (_PyBytes_Resize(&res, respos) < 0)
5399 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005400
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005401 Py_XDECREF(exc);
5402 Py_XDECREF(errorHandler);
5403 return res;
5404
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005406 Py_XDECREF(res);
5407 Py_XDECREF(exc);
5408 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 return NULL;
5410}
5411
5412PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414{
5415 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 PyErr_BadArgument();
5417 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 }
5419 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 PyUnicode_GET_SIZE(unicode),
5421 mapping,
5422 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423}
5424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425/* create or adjust a UnicodeTranslateError */
5426static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 const Py_UNICODE *unicode, Py_ssize_t size,
5428 Py_ssize_t startpos, Py_ssize_t endpos,
5429 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005432 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 }
5435 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5437 goto onError;
5438 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5439 goto onError;
5440 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5441 goto onError;
5442 return;
5443 onError:
5444 Py_DECREF(*exceptionObject);
5445 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 }
5447}
5448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005449/* raises a UnicodeTranslateError */
5450static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 const Py_UNICODE *unicode, Py_ssize_t size,
5452 Py_ssize_t startpos, Py_ssize_t endpos,
5453 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005454{
5455 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005456 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459}
5460
5461/* error handling callback helper:
5462 build arguments, call the callback and check the arguments,
5463 put the result into newpos and return the replacement string, which
5464 has to be freed by the caller */
5465static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 PyObject **errorHandler,
5467 const char *reason,
5468 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5469 Py_ssize_t startpos, Py_ssize_t endpos,
5470 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005471{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005472 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005474 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 PyObject *restuple;
5476 PyObject *resunicode;
5477
5478 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 }
5483
5484 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005487 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488
5489 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005491 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005494 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 Py_DECREF(restuple);
5496 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005497 }
5498 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 &resunicode, &i_newpos)) {
5500 Py_DECREF(restuple);
5501 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005503 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005505 else
5506 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005507 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5509 Py_DECREF(restuple);
5510 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005511 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 Py_INCREF(resunicode);
5513 Py_DECREF(restuple);
5514 return resunicode;
5515}
5516
5517/* Lookup the character ch in the mapping and put the result in result,
5518 which must be decrefed by the caller.
5519 Return 0 on success, -1 on error */
5520static
5521int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5522{
Christian Heimes217cfd12007-12-02 14:31:20 +00005523 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 PyObject *x;
5525
5526 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005528 x = PyObject_GetItem(mapping, w);
5529 Py_DECREF(w);
5530 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5532 /* No mapping found means: use 1:1 mapping. */
5533 PyErr_Clear();
5534 *result = NULL;
5535 return 0;
5536 } else
5537 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538 }
5539 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 *result = x;
5541 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005542 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005543 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 long value = PyLong_AS_LONG(x);
5545 long max = PyUnicode_GetMax();
5546 if (value < 0 || value > max) {
5547 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005548 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 Py_DECREF(x);
5550 return -1;
5551 }
5552 *result = x;
5553 return 0;
5554 }
5555 else if (PyUnicode_Check(x)) {
5556 *result = x;
5557 return 0;
5558 }
5559 else {
5560 /* wrong return value */
5561 PyErr_SetString(PyExc_TypeError,
5562 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005563 Py_DECREF(x);
5564 return -1;
5565 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566}
5567/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 if not reallocate and adjust various state variables.
5569 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570static
Walter Dörwald4894c302003-10-24 14:25:28 +00005571int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005573{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005574 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005575 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 /* remember old output position */
5577 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5578 /* exponentially overallocate to minimize reallocations */
5579 if (requiredsize < 2 * oldsize)
5580 requiredsize = 2 * oldsize;
5581 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5582 return -1;
5583 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005584 }
5585 return 0;
5586}
5587/* lookup the character, put the result in the output string and adjust
5588 various state variables. Return a new reference to the object that
5589 was put in the output buffer in *result, or Py_None, if the mapping was
5590 undefined (in which case no character was written).
5591 The called must decref result.
5592 Return 0 on success, -1 on error. */
5593static
Walter Dörwald4894c302003-10-24 14:25:28 +00005594int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5596 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597{
Walter Dörwald4894c302003-10-24 14:25:28 +00005598 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 /* not found => default to 1:1 mapping */
5602 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 }
5604 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005606 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 /* no overflow check, because we know that the space is enough */
5608 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005609 }
5610 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5612 if (repsize==1) {
5613 /* no overflow check, because we know that the space is enough */
5614 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5615 }
5616 else if (repsize!=0) {
5617 /* more than one character */
5618 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5619 (insize - (curinp-startinp)) +
5620 repsize - 1;
5621 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5622 return -1;
5623 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5624 *outp += repsize;
5625 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005626 }
5627 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 return 0;
5630}
5631
5632PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 Py_ssize_t size,
5634 PyObject *mapping,
5635 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637 /* output object */
5638 PyObject *res = NULL;
5639 /* pointers to the beginning and end+1 of input */
5640 const Py_UNICODE *startp = p;
5641 const Py_UNICODE *endp = p + size;
5642 /* pointer into the output */
5643 Py_UNICODE *str;
5644 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005645 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 char *reason = "character maps to <undefined>";
5647 PyObject *errorHandler = NULL;
5648 PyObject *exc = NULL;
5649 /* the following variable is used for caching string comparisons
5650 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5651 * 3=ignore, 4=xmlcharrefreplace */
5652 int known_errorHandler = -1;
5653
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 PyErr_BadArgument();
5656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658
5659 /* allocate enough for a simple 1:1 translation without
5660 replacements, if we need more, we'll resize */
5661 res = PyUnicode_FromUnicode(NULL, size);
5662 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005666 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005668 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 /* try to encode it */
5670 PyObject *x = NULL;
5671 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5672 Py_XDECREF(x);
5673 goto onError;
5674 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005675 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 if (x!=Py_None) /* it worked => adjust input pointer */
5677 ++p;
5678 else { /* untranslatable character */
5679 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5680 Py_ssize_t repsize;
5681 Py_ssize_t newpos;
5682 Py_UNICODE *uni2;
5683 /* startpos for collecting untranslatable chars */
5684 const Py_UNICODE *collstart = p;
5685 const Py_UNICODE *collend = p+1;
5686 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 /* find all untranslatable characters */
5689 while (collend < endp) {
5690 if (charmaptranslate_lookup(*collend, mapping, &x))
5691 goto onError;
5692 Py_XDECREF(x);
5693 if (x!=Py_None)
5694 break;
5695 ++collend;
5696 }
5697 /* cache callback name lookup
5698 * (if not done yet, i.e. it's the first error) */
5699 if (known_errorHandler==-1) {
5700 if ((errors==NULL) || (!strcmp(errors, "strict")))
5701 known_errorHandler = 1;
5702 else if (!strcmp(errors, "replace"))
5703 known_errorHandler = 2;
5704 else if (!strcmp(errors, "ignore"))
5705 known_errorHandler = 3;
5706 else if (!strcmp(errors, "xmlcharrefreplace"))
5707 known_errorHandler = 4;
5708 else
5709 known_errorHandler = 0;
5710 }
5711 switch (known_errorHandler) {
5712 case 1: /* strict */
5713 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005714 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 case 2: /* replace */
5716 /* No need to check for space, this is a 1:1 replacement */
5717 for (coll = collstart; coll<collend; ++coll)
5718 *str++ = '?';
5719 /* fall through */
5720 case 3: /* ignore */
5721 p = collend;
5722 break;
5723 case 4: /* xmlcharrefreplace */
5724 /* generate replacement (temporarily (mis)uses p) */
5725 for (p = collstart; p < collend; ++p) {
5726 char buffer[2+29+1+1];
5727 char *cp;
5728 sprintf(buffer, "&#%d;", (int)*p);
5729 if (charmaptranslate_makespace(&res, &str,
5730 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5731 goto onError;
5732 for (cp = buffer; *cp; ++cp)
5733 *str++ = *cp;
5734 }
5735 p = collend;
5736 break;
5737 default:
5738 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5739 reason, startp, size, &exc,
5740 collstart-startp, collend-startp, &newpos);
5741 if (repunicode == NULL)
5742 goto onError;
5743 /* generate replacement */
5744 repsize = PyUnicode_GET_SIZE(repunicode);
5745 if (charmaptranslate_makespace(&res, &str,
5746 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5747 Py_DECREF(repunicode);
5748 goto onError;
5749 }
5750 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5751 *str++ = *uni2;
5752 p = startp + newpos;
5753 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005754 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005755 }
5756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005757 /* Resize if we allocated to much */
5758 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005759 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 if (PyUnicode_Resize(&res, respos) < 0)
5761 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 }
5763 Py_XDECREF(exc);
5764 Py_XDECREF(errorHandler);
5765 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 Py_XDECREF(res);
5769 Py_XDECREF(exc);
5770 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 return NULL;
5772}
5773
5774PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 PyObject *mapping,
5776 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777{
5778 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005779
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780 str = PyUnicode_FromObject(str);
5781 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 PyUnicode_GET_SIZE(str),
5785 mapping,
5786 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 Py_DECREF(str);
5788 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005789
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 Py_XDECREF(str);
5792 return NULL;
5793}
Tim Petersced69f82003-09-16 20:30:58 +00005794
Guido van Rossum9e896b32000-04-05 20:11:21 +00005795/* --- Decimal Encoder ---------------------------------------------------- */
5796
5797int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 Py_ssize_t length,
5799 char *output,
5800 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005801{
5802 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 PyObject *errorHandler = NULL;
5804 PyObject *exc = NULL;
5805 const char *encoding = "decimal";
5806 const char *reason = "invalid decimal Unicode string";
5807 /* the following variable is used for caching string comparisons
5808 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5809 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005810
5811 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 PyErr_BadArgument();
5813 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005814 }
5815
5816 p = s;
5817 end = s + length;
5818 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 register Py_UNICODE ch = *p;
5820 int decimal;
5821 PyObject *repunicode;
5822 Py_ssize_t repsize;
5823 Py_ssize_t newpos;
5824 Py_UNICODE *uni2;
5825 Py_UNICODE *collstart;
5826 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005827
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005829 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 ++p;
5831 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005832 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 decimal = Py_UNICODE_TODECIMAL(ch);
5834 if (decimal >= 0) {
5835 *output++ = '0' + decimal;
5836 ++p;
5837 continue;
5838 }
5839 if (0 < ch && ch < 256) {
5840 *output++ = (char)ch;
5841 ++p;
5842 continue;
5843 }
5844 /* All other characters are considered unencodable */
5845 collstart = p;
5846 collend = p+1;
5847 while (collend < end) {
5848 if ((0 < *collend && *collend < 256) ||
5849 !Py_UNICODE_ISSPACE(*collend) ||
5850 Py_UNICODE_TODECIMAL(*collend))
5851 break;
5852 }
5853 /* cache callback name lookup
5854 * (if not done yet, i.e. it's the first error) */
5855 if (known_errorHandler==-1) {
5856 if ((errors==NULL) || (!strcmp(errors, "strict")))
5857 known_errorHandler = 1;
5858 else if (!strcmp(errors, "replace"))
5859 known_errorHandler = 2;
5860 else if (!strcmp(errors, "ignore"))
5861 known_errorHandler = 3;
5862 else if (!strcmp(errors, "xmlcharrefreplace"))
5863 known_errorHandler = 4;
5864 else
5865 known_errorHandler = 0;
5866 }
5867 switch (known_errorHandler) {
5868 case 1: /* strict */
5869 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5870 goto onError;
5871 case 2: /* replace */
5872 for (p = collstart; p < collend; ++p)
5873 *output++ = '?';
5874 /* fall through */
5875 case 3: /* ignore */
5876 p = collend;
5877 break;
5878 case 4: /* xmlcharrefreplace */
5879 /* generate replacement (temporarily (mis)uses p) */
5880 for (p = collstart; p < collend; ++p)
5881 output += sprintf(output, "&#%d;", (int)*p);
5882 p = collend;
5883 break;
5884 default:
5885 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5886 encoding, reason, s, length, &exc,
5887 collstart-s, collend-s, &newpos);
5888 if (repunicode == NULL)
5889 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005890 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005891 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005892 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5893 Py_DECREF(repunicode);
5894 goto onError;
5895 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 /* generate replacement */
5897 repsize = PyUnicode_GET_SIZE(repunicode);
5898 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5899 Py_UNICODE ch = *uni2;
5900 if (Py_UNICODE_ISSPACE(ch))
5901 *output++ = ' ';
5902 else {
5903 decimal = Py_UNICODE_TODECIMAL(ch);
5904 if (decimal >= 0)
5905 *output++ = '0' + decimal;
5906 else if (0 < ch && ch < 256)
5907 *output++ = (char)ch;
5908 else {
5909 Py_DECREF(repunicode);
5910 raise_encode_exception(&exc, encoding,
5911 s, length, collstart-s, collend-s, reason);
5912 goto onError;
5913 }
5914 }
5915 }
5916 p = s + newpos;
5917 Py_DECREF(repunicode);
5918 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005919 }
5920 /* 0-terminate the output string */
5921 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005922 Py_XDECREF(exc);
5923 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005924 return 0;
5925
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 Py_XDECREF(exc);
5928 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005929 return -1;
5930}
5931
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932/* --- Helpers ------------------------------------------------------------ */
5933
Eric Smith8c663262007-08-25 02:26:07 +00005934#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005935#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005936
Thomas Wouters477c8d52006-05-27 19:21:47 +00005937#include "stringlib/count.h"
5938#include "stringlib/find.h"
5939#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005940#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005941
Eric Smith5807c412008-05-11 21:00:57 +00005942#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005943#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005944#include "stringlib/localeutil.h"
5945
/* Helper macro to fix up start/end slice values against a length.
 *
 * - end greater than len is set to len;
 * - a negative end has len added to it and is then floored at 0;
 * - a negative start has len added to it and is then floored at 0.
 *
 * start is NOT limited by len or by end, so callers must cope with
 * start > end (and start > len) after the adjustment.
 *
 * start and end must be modifiable lvalues.  Every argument can be
 * evaluated more than once, so do not pass expressions with side effects.
 * The do/while(0) wrapper makes the expansion a single statement, which
 * keeps the macro safe inside an unbraced if/else.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005960
Martin v. Löwis18e16552006-02-15 17:27:45 +00005961Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005962 PyObject *substr,
5963 Py_ssize_t start,
5964 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005966 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005967 PyUnicodeObject* str_obj;
5968 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005969
Thomas Wouters477c8d52006-05-27 19:21:47 +00005970 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5971 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005973 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5974 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005975 Py_DECREF(str_obj);
5976 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 }
Tim Petersced69f82003-09-16 20:30:58 +00005978
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005979 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005980 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005981 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5982 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00005983 );
5984
5985 Py_DECREF(sub_obj);
5986 Py_DECREF(str_obj);
5987
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 return result;
5989}
5990
Martin v. Löwis18e16552006-02-15 17:27:45 +00005991Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005992 PyObject *sub,
5993 Py_ssize_t start,
5994 Py_ssize_t end,
5995 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005997 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005998
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006000 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006002 sub = PyUnicode_FromObject(sub);
6003 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 Py_DECREF(str);
6005 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 }
Tim Petersced69f82003-09-16 20:30:58 +00006007
Thomas Wouters477c8d52006-05-27 19:21:47 +00006008 if (direction > 0)
6009 result = stringlib_find_slice(
6010 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6011 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6012 start, end
6013 );
6014 else
6015 result = stringlib_rfind_slice(
6016 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6017 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6018 start, end
6019 );
6020
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006022 Py_DECREF(sub);
6023
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 return result;
6025}
6026
Tim Petersced69f82003-09-16 20:30:58 +00006027static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 PyUnicodeObject *substring,
6030 Py_ssize_t start,
6031 Py_ssize_t end,
6032 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 if (substring->length == 0)
6035 return 1;
6036
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006037 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 end -= substring->length;
6039 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041
6042 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 if (Py_UNICODE_MATCH(self, end, substring))
6044 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 } else {
6046 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 }
6049
6050 return 0;
6051}
6052
Martin v. Löwis18e16552006-02-15 17:27:45 +00006053Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 PyObject *substr,
6055 Py_ssize_t start,
6056 Py_ssize_t end,
6057 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006059 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006060
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 str = PyUnicode_FromObject(str);
6062 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 substr = PyUnicode_FromObject(substr);
6065 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 Py_DECREF(str);
6067 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 }
Tim Petersced69f82003-09-16 20:30:58 +00006069
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 (PyUnicodeObject *)substr,
6072 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 Py_DECREF(str);
6074 Py_DECREF(substr);
6075 return result;
6076}
6077
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078/* Apply fixfct filter to the Unicode object self and return a
6079 reference to the modified object */
6080
Tim Petersced69f82003-09-16 20:30:58 +00006081static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084{
6085
6086 PyUnicodeObject *u;
6087
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006088 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006091
6092 Py_UNICODE_COPY(u->str, self->str, self->length);
6093
Tim Peters7a29bd52001-09-12 03:03:31 +00006094 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 /* fixfct should return TRUE if it modified the buffer. If
6096 FALSE, return a reference to the original buffer instead
6097 (to save space, not time) */
6098 Py_INCREF(self);
6099 Py_DECREF(u);
6100 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 }
6102 return (PyObject*) u;
6103}
6104
Tim Petersced69f82003-09-16 20:30:58 +00006105static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106int fixupper(PyUnicodeObject *self)
6107{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006108 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 Py_UNICODE *s = self->str;
6110 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006111
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006114
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 ch = Py_UNICODE_TOUPPER(*s);
6116 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 *s = ch;
6119 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 s++;
6121 }
6122
6123 return status;
6124}
6125
Tim Petersced69f82003-09-16 20:30:58 +00006126static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127int fixlower(PyUnicodeObject *self)
6128{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 Py_UNICODE *s = self->str;
6131 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006135
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 ch = Py_UNICODE_TOLOWER(*s);
6137 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 *s = ch;
6140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 s++;
6142 }
6143
6144 return status;
6145}
6146
Tim Petersced69f82003-09-16 20:30:58 +00006147static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148int fixswapcase(PyUnicodeObject *self)
6149{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006150 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 Py_UNICODE *s = self->str;
6152 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 while (len-- > 0) {
6155 if (Py_UNICODE_ISUPPER(*s)) {
6156 *s = Py_UNICODE_TOLOWER(*s);
6157 status = 1;
6158 } else if (Py_UNICODE_ISLOWER(*s)) {
6159 *s = Py_UNICODE_TOUPPER(*s);
6160 status = 1;
6161 }
6162 s++;
6163 }
6164
6165 return status;
6166}
6167
Tim Petersced69f82003-09-16 20:30:58 +00006168static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169int fixcapitalize(PyUnicodeObject *self)
6170{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006171 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006172 Py_UNICODE *s = self->str;
6173 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006174
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006175 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006177 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 *s = Py_UNICODE_TOUPPER(*s);
6179 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006181 s++;
6182 while (--len > 0) {
6183 if (Py_UNICODE_ISUPPER(*s)) {
6184 *s = Py_UNICODE_TOLOWER(*s);
6185 status = 1;
6186 }
6187 s++;
6188 }
6189 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190}
6191
6192static
6193int fixtitle(PyUnicodeObject *self)
6194{
6195 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6196 register Py_UNICODE *e;
6197 int previous_is_cased;
6198
6199 /* Shortcut for single character strings */
6200 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6202 if (*p != ch) {
6203 *p = ch;
6204 return 1;
6205 }
6206 else
6207 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 }
Tim Petersced69f82003-09-16 20:30:58 +00006209
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 e = p + PyUnicode_GET_SIZE(self);
6211 previous_is_cased = 0;
6212 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006214
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 if (previous_is_cased)
6216 *p = Py_UNICODE_TOLOWER(ch);
6217 else
6218 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006219
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 if (Py_UNICODE_ISLOWER(ch) ||
6221 Py_UNICODE_ISUPPER(ch) ||
6222 Py_UNICODE_ISTITLE(ch))
6223 previous_is_cased = 1;
6224 else
6225 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 }
6227 return 1;
6228}
6229
Tim Peters8ce9f162004-08-27 01:49:32 +00006230PyObject *
6231PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232{
Skip Montanaro6543b452004-09-16 03:28:13 +00006233 const Py_UNICODE blank = ' ';
6234 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006235 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006236 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006237 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6238 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006239 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6240 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006241 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006242 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243
Tim Peters05eba1f2004-08-27 21:32:02 +00006244 fseq = PySequence_Fast(seq, "");
6245 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006246 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006247 }
6248
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006249 /* NOTE: the following code can't call back into Python code,
6250 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006251 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006252
Tim Peters05eba1f2004-08-27 21:32:02 +00006253 seqlen = PySequence_Fast_GET_SIZE(fseq);
6254 /* If empty sequence, return u"". */
6255 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006256 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6257 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006258 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006259 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006260 /* If singleton sequence with an exact Unicode, return that. */
6261 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 item = items[0];
6263 if (PyUnicode_CheckExact(item)) {
6264 Py_INCREF(item);
6265 res = (PyUnicodeObject *)item;
6266 goto Done;
6267 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006268 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006269 else {
6270 /* Set up sep and seplen */
6271 if (separator == NULL) {
6272 sep = &blank;
6273 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006274 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006275 else {
6276 if (!PyUnicode_Check(separator)) {
6277 PyErr_Format(PyExc_TypeError,
6278 "separator: expected str instance,"
6279 " %.80s found",
6280 Py_TYPE(separator)->tp_name);
6281 goto onError;
6282 }
6283 sep = PyUnicode_AS_UNICODE(separator);
6284 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006285 }
6286 }
6287
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006288 /* There are at least two things to join, or else we have a subclass
6289 * of str in the sequence.
6290 * Do a pre-pass to figure out the total amount of space we'll
6291 * need (sz), and see whether all argument are strings.
6292 */
6293 sz = 0;
6294 for (i = 0; i < seqlen; i++) {
6295 const Py_ssize_t old_sz = sz;
6296 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 if (!PyUnicode_Check(item)) {
6298 PyErr_Format(PyExc_TypeError,
6299 "sequence item %zd: expected str instance,"
6300 " %.80s found",
6301 i, Py_TYPE(item)->tp_name);
6302 goto onError;
6303 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006304 sz += PyUnicode_GET_SIZE(item);
6305 if (i != 0)
6306 sz += seplen;
6307 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6308 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006310 goto onError;
6311 }
6312 }
Tim Petersced69f82003-09-16 20:30:58 +00006313
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006314 res = _PyUnicode_New(sz);
6315 if (res == NULL)
6316 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006317
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006318 /* Catenate everything. */
6319 res_p = PyUnicode_AS_UNICODE(res);
6320 for (i = 0; i < seqlen; ++i) {
6321 Py_ssize_t itemlen;
6322 item = items[i];
6323 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 /* Copy item, and maybe the separator. */
6325 if (i) {
6326 Py_UNICODE_COPY(res_p, sep, seplen);
6327 res_p += seplen;
6328 }
6329 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6330 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006331 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006332
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006334 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 return (PyObject *)res;
6336
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006338 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006339 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 return NULL;
6341}
6342
Tim Petersced69f82003-09-16 20:30:58 +00006343static
6344PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 Py_ssize_t left,
6346 Py_ssize_t right,
6347 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348{
6349 PyUnicodeObject *u;
6350
6351 if (left < 0)
6352 left = 0;
6353 if (right < 0)
6354 right = 0;
6355
Tim Peters7a29bd52001-09-12 03:03:31 +00006356 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 Py_INCREF(self);
6358 return self;
6359 }
6360
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006361 if (left > PY_SSIZE_T_MAX - self->length ||
6362 right > PY_SSIZE_T_MAX - (left + self->length)) {
6363 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6364 return NULL;
6365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 u = _PyUnicode_New(left + self->length + right);
6367 if (u) {
6368 if (left)
6369 Py_UNICODE_FILL(u->str, fill, left);
6370 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6371 if (right)
6372 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6373 }
6374
6375 return u;
6376}
6377
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006378PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381
6382 string = PyUnicode_FromObject(string);
6383 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006386 list = stringlib_splitlines(
6387 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6388 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389
6390 Py_DECREF(string);
6391 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392}
6393
Tim Petersced69f82003-09-16 20:30:58 +00006394static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 PyUnicodeObject *substring,
6397 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006400 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006403 return stringlib_split_whitespace(
6404 (PyObject*) self, self->str, self->length, maxcount
6405 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006407 return stringlib_split(
6408 (PyObject*) self, self->str, self->length,
6409 substring->str, substring->length,
6410 maxcount
6411 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412}
6413
Tim Petersced69f82003-09-16 20:30:58 +00006414static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006415PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 PyUnicodeObject *substring,
6417 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006418{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006419 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006420 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006421
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006422 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006423 return stringlib_rsplit_whitespace(
6424 (PyObject*) self, self->str, self->length, maxcount
6425 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006426
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006427 return stringlib_rsplit(
6428 (PyObject*) self, self->str, self->length,
6429 substring->str, substring->length,
6430 maxcount
6431 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006432}
6433
6434static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 PyUnicodeObject *str1,
6437 PyUnicodeObject *str2,
6438 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439{
6440 PyUnicodeObject *u;
6441
6442 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006444 else if (maxcount == 0 || self->length == 0)
6445 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446
Thomas Wouters477c8d52006-05-27 19:21:47 +00006447 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006448 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006449 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006450 if (str1->length == 0)
6451 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006452 if (str1->length == 1) {
6453 /* replace characters */
6454 Py_UNICODE u1, u2;
6455 if (!findchar(self->str, self->length, str1->str[0]))
6456 goto nothing;
6457 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6458 if (!u)
6459 return NULL;
6460 Py_UNICODE_COPY(u->str, self->str, self->length);
6461 u1 = str1->str[0];
6462 u2 = str2->str[0];
6463 for (i = 0; i < u->length; i++)
6464 if (u->str[i] == u1) {
6465 if (--maxcount < 0)
6466 break;
6467 u->str[i] = u2;
6468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006470 i = stringlib_find(
6471 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006473 if (i < 0)
6474 goto nothing;
6475 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6476 if (!u)
6477 return NULL;
6478 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006479
6480 /* change everything in-place, starting with this one */
6481 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6482 i += str1->length;
6483
6484 while ( --maxcount > 0) {
6485 i = stringlib_find(self->str+i, self->length-i,
6486 str1->str, str1->length,
6487 i);
6488 if (i == -1)
6489 break;
6490 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6491 i += str1->length;
6492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006495
6496 Py_ssize_t n, i, j, e;
6497 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 Py_UNICODE *p;
6499
6500 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006501 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6502 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006503 if (n == 0)
6504 goto nothing;
6505 /* new_size = self->length + n * (str2->length - str1->length)); */
6506 delta = (str2->length - str1->length);
6507 if (delta == 0) {
6508 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006510 product = n * (str2->length - str1->length);
6511 if ((product / (str2->length - str1->length)) != n) {
6512 PyErr_SetString(PyExc_OverflowError,
6513 "replace string is too long");
6514 return NULL;
6515 }
6516 new_size = self->length + product;
6517 if (new_size < 0) {
6518 PyErr_SetString(PyExc_OverflowError,
6519 "replace string is too long");
6520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 }
6522 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006523 u = _PyUnicode_New(new_size);
6524 if (!u)
6525 return NULL;
6526 i = 0;
6527 p = u->str;
6528 e = self->length - str1->length;
6529 if (str1->length > 0) {
6530 while (n-- > 0) {
6531 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006532 j = stringlib_find(self->str+i, self->length-i,
6533 str1->str, str1->length,
6534 i);
6535 if (j == -1)
6536 break;
6537 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006538 /* copy unchanged part [i:j] */
6539 Py_UNICODE_COPY(p, self->str+i, j-i);
6540 p += j - i;
6541 }
6542 /* copy substitution string */
6543 if (str2->length > 0) {
6544 Py_UNICODE_COPY(p, str2->str, str2->length);
6545 p += str2->length;
6546 }
6547 i = j + str1->length;
6548 }
6549 if (i < self->length)
6550 /* copy tail [i:] */
6551 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6552 } else {
6553 /* interleave */
6554 while (n > 0) {
6555 Py_UNICODE_COPY(p, str2->str, str2->length);
6556 p += str2->length;
6557 if (--n <= 0)
6558 break;
6559 *p++ = self->str[i++];
6560 }
6561 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006565
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006567 /* nothing to replace; return original string (when possible) */
6568 if (PyUnicode_CheckExact(self)) {
6569 Py_INCREF(self);
6570 return (PyObject *) self;
6571 }
6572 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573}
6574
6575/* --- Unicode Object Methods --------------------------------------------- */
6576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006577PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579\n\
6580Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006581characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582
6583static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006584unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 return fixup(self, fixtitle);
6587}
6588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006589PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591\n\
6592Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006593have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594
6595static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006596unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 return fixup(self, fixcapitalize);
6599}
6600
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize each word via
   fixup()/fixcapitalize, and join the words with the default separator.
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the old word before storing its replacement in the slot */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6638
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006639/* Argument converter. Coerces to a single unicode character */
6640
6641static int
6642convert_uc(PyObject *obj, void *addr)
6643{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006644 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6645 PyObject *uniobj;
6646 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006647
Benjamin Peterson14339b62009-01-31 16:36:08 +00006648 uniobj = PyUnicode_FromObject(obj);
6649 if (uniobj == NULL) {
6650 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006652 return 0;
6653 }
6654 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6655 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006657 Py_DECREF(uniobj);
6658 return 0;
6659 }
6660 unistr = PyUnicode_AS_UNICODE(uniobj);
6661 *fillcharloc = unistr[0];
6662 Py_DECREF(uniobj);
6663 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006664}
6665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006666PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006669Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006670done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672static PyObject *
6673unicode_center(PyUnicodeObject *self, PyObject *args)
6674{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006675 Py_ssize_t marg, left;
6676 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006677 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678
Thomas Woutersde017742006-02-16 19:34:37 +00006679 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 return NULL;
6681
Tim Peters7a29bd52001-09-12 03:03:31 +00006682 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 Py_INCREF(self);
6684 return (PyObject*) self;
6685 }
6686
6687 marg = width - self->length;
6688 left = marg / 2 + (marg & width & 1);
6689
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006690 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691}
6692
Marc-André Lemburge5034372000-08-08 08:04:29 +00006693#if 0
6694
6695/* This code should go into some future Unicode collation support
6696 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006697 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006698
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006699/* speedy UTF-16 code point order comparison */
6700/* gleaned from: */
6701/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6702
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006703static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006704{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006705 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006706 0, 0, 0, 0, 0, 0, 0, 0,
6707 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006708 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006709};
6710
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711static int
6712unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6713{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006714 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006715
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 Py_UNICODE *s1 = str1->str;
6717 Py_UNICODE *s2 = str2->str;
6718
6719 len1 = str1->length;
6720 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006723 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006724
6725 c1 = *s1++;
6726 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006727
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 if (c1 > (1<<11) * 26)
6729 c1 += utf16Fixup[c1>>11];
6730 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006731 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006732 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006733
6734 if (c1 != c2)
6735 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006736
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006737 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 }
6739
6740 return (len1 < len2) ? -1 : (len1 != len2);
6741}
6742
Marc-André Lemburge5034372000-08-08 08:04:29 +00006743#else
6744
6745static int
6746unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6747{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006748 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006749
6750 Py_UNICODE *s1 = str1->str;
6751 Py_UNICODE *s2 = str2->str;
6752
6753 len1 = str1->length;
6754 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006755
Marc-André Lemburge5034372000-08-08 08:04:29 +00006756 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006757 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006758
Fredrik Lundh45714e92001-06-26 16:39:36 +00006759 c1 = *s1++;
6760 c2 = *s2++;
6761
6762 if (c1 != c2)
6763 return (c1 < c2) ? -1 : 1;
6764
Marc-André Lemburge5034372000-08-08 08:04:29 +00006765 len1--; len2--;
6766 }
6767
6768 return (len1 < len2) ? -1 : (len1 != len2);
6769}
6770
6771#endif
6772
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006776 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6777 return unicode_compare((PyUnicodeObject *)left,
6778 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006779 PyErr_Format(PyExc_TypeError,
6780 "Can't compare %.100s and %.100s",
6781 left->ob_type->tp_name,
6782 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 return -1;
6784}
6785
Martin v. Löwis5b222132007-06-10 09:51:05 +00006786int
6787PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6788{
6789 int i;
6790 Py_UNICODE *id;
6791 assert(PyUnicode_Check(uni));
6792 id = PyUnicode_AS_UNICODE(uni);
6793 /* Compare Unicode string and source character set string */
6794 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 if (id[i] != str[i])
6796 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006797 /* This check keeps Python strings that end in '\0' from comparing equal
6798 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006799 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006801 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006803 return 0;
6804}
6805
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006806
Benjamin Peterson29060642009-01-31 22:14:21 +00006807#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006808 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006809
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006810PyObject *PyUnicode_RichCompare(PyObject *left,
6811 PyObject *right,
6812 int op)
6813{
6814 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006815
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006816 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6817 PyObject *v;
6818 if (((PyUnicodeObject *) left)->length !=
6819 ((PyUnicodeObject *) right)->length) {
6820 if (op == Py_EQ) {
6821 Py_INCREF(Py_False);
6822 return Py_False;
6823 }
6824 if (op == Py_NE) {
6825 Py_INCREF(Py_True);
6826 return Py_True;
6827 }
6828 }
6829 if (left == right)
6830 result = 0;
6831 else
6832 result = unicode_compare((PyUnicodeObject *)left,
6833 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006834
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006835 /* Convert the return value to a Boolean */
6836 switch (op) {
6837 case Py_EQ:
6838 v = TEST_COND(result == 0);
6839 break;
6840 case Py_NE:
6841 v = TEST_COND(result != 0);
6842 break;
6843 case Py_LE:
6844 v = TEST_COND(result <= 0);
6845 break;
6846 case Py_GE:
6847 v = TEST_COND(result >= 0);
6848 break;
6849 case Py_LT:
6850 v = TEST_COND(result == -1);
6851 break;
6852 case Py_GT:
6853 v = TEST_COND(result == 1);
6854 break;
6855 default:
6856 PyErr_BadArgument();
6857 return NULL;
6858 }
6859 Py_INCREF(v);
6860 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006861 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006862
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006863 Py_INCREF(Py_NotImplemented);
6864 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006865}
6866
Guido van Rossum403d68b2000-03-13 15:55:09 +00006867int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006869{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006870 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006871 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006872
6873 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006874 sub = PyUnicode_FromObject(element);
6875 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 PyErr_Format(PyExc_TypeError,
6877 "'in <string>' requires string as left operand, not %s",
6878 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006879 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006880 }
6881
Thomas Wouters477c8d52006-05-27 19:21:47 +00006882 str = PyUnicode_FromObject(container);
6883 if (!str) {
6884 Py_DECREF(sub);
6885 return -1;
6886 }
6887
6888 result = stringlib_contains_obj(str, sub);
6889
6890 Py_DECREF(str);
6891 Py_DECREF(sub);
6892
Guido van Rossum403d68b2000-03-13 15:55:09 +00006893 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006894}
6895
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896/* Concat to string or Unicode object giving a new Unicode object. */
6897
6898PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
6901 PyUnicodeObject *u = NULL, *v = NULL, *w;
6902
6903 /* Coerce the two arguments */
6904 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6905 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6908 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910
6911 /* Shortcuts */
6912 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 Py_DECREF(v);
6914 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 }
6916 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 Py_DECREF(u);
6918 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 }
6920
6921 /* Concat the two Unicode strings */
6922 w = _PyUnicode_New(u->length + v->length);
6923 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 Py_UNICODE_COPY(w->str, u->str, u->length);
6926 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6927
6928 Py_DECREF(u);
6929 Py_DECREF(v);
6930 return (PyObject *)w;
6931
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 Py_XDECREF(u);
6934 Py_XDECREF(v);
6935 return NULL;
6936}
6937
Walter Dörwald1ab83302007-05-18 17:15:44 +00006938void
6939PyUnicode_Append(PyObject **pleft, PyObject *right)
6940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006941 PyObject *new;
6942 if (*pleft == NULL)
6943 return;
6944 if (right == NULL || !PyUnicode_Check(*pleft)) {
6945 Py_DECREF(*pleft);
6946 *pleft = NULL;
6947 return;
6948 }
6949 new = PyUnicode_Concat(*pleft, right);
6950 Py_DECREF(*pleft);
6951 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006952}
6953
6954void
6955PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6956{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006957 PyUnicode_Append(pleft, right);
6958 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006959}
6960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006961PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006964Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006965string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006966interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967
6968static PyObject *
6969unicode_count(PyUnicodeObject *self, PyObject *args)
6970{
6971 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006972 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006973 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 PyObject *result;
6975
Guido van Rossumb8872e62000-05-09 14:14:27 +00006976 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 return NULL;
6979
6980 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006981 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006984
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006985 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00006986 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006987 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006988 substring->str, substring->length,
6989 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006990 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
6992 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006993
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 return result;
6995}
6996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006997PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007000Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007001to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007002handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007003a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7004'xmlcharrefreplace' as well as any other name registered with\n\
7005codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
7007static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007008unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007010 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 char *encoding = NULL;
7012 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007013 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007014
Benjamin Peterson308d6372009-09-18 21:42:35 +00007015 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7016 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007018 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007019 if (v == NULL)
7020 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007021 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007022 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007023 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007024 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007025 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007026 Py_DECREF(v);
7027 return NULL;
7028 }
7029 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007030
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007032 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007033}
7034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007035PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037\n\
7038Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007039If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040
7041static PyObject*
7042unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7043{
7044 Py_UNICODE *e;
7045 Py_UNICODE *p;
7046 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007047 Py_UNICODE *qe;
7048 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 PyUnicodeObject *u;
7050 int tabsize = 8;
7051
7052 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054
Thomas Wouters7e474022000-07-16 12:04:32 +00007055 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007056 i = 0; /* chars up to and including most recent \n or \r */
7057 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7058 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 for (p = self->str; p < e; p++)
7060 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 if (tabsize > 0) {
7062 incr = tabsize - (j % tabsize); /* cannot overflow */
7063 if (j > PY_SSIZE_T_MAX - incr)
7064 goto overflow1;
7065 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007066 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 if (j > PY_SSIZE_T_MAX - 1)
7070 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 j++;
7072 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007073 if (i > PY_SSIZE_T_MAX - j)
7074 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007076 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 }
7078 }
7079
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007080 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007082
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083 /* Second pass: create output string and fill it */
7084 u = _PyUnicode_New(i + j);
7085 if (!u)
7086 return NULL;
7087
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007088 j = 0; /* same as in first pass */
7089 q = u->str; /* next output char */
7090 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091
7092 for (p = self->str; p < e; p++)
7093 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 if (tabsize > 0) {
7095 i = tabsize - (j % tabsize);
7096 j += i;
7097 while (i--) {
7098 if (q >= qe)
7099 goto overflow2;
7100 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007101 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007103 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 else {
7105 if (q >= qe)
7106 goto overflow2;
7107 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007108 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 if (*p == '\n' || *p == '\r')
7110 j = 0;
7111 }
7112
7113 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007114
7115 overflow2:
7116 Py_DECREF(u);
7117 overflow1:
7118 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120}
7121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007122PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124\n\
7125Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007126such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127arguments start and end are interpreted as in slice notation.\n\
7128\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007129Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130
7131static PyObject *
7132unicode_find(PyUnicodeObject *self, PyObject *args)
7133{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007134 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007135 Py_ssize_t start;
7136 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007137 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
Christian Heimes9cd17752007-11-18 19:35:23 +00007139 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141
Thomas Wouters477c8d52006-05-27 19:21:47 +00007142 result = stringlib_find_slice(
7143 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7144 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7145 start, end
7146 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147
7148 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007149
Christian Heimes217cfd12007-12-02 14:31:20 +00007150 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151}
7152
7153static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007154unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155{
7156 if (index < 0 || index >= self->length) {
7157 PyErr_SetString(PyExc_IndexError, "string index out of range");
7158 return NULL;
7159 }
7160
7161 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7162}
7163
Guido van Rossumc2504932007-09-18 19:42:40 +00007164/* Believe it or not, this produces the same value for ASCII strings
7165 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007167unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168{
Guido van Rossumc2504932007-09-18 19:42:40 +00007169 Py_ssize_t len;
7170 Py_UNICODE *p;
7171 long x;
7172
7173 if (self->hash != -1)
7174 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007175 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007176 p = self->str;
7177 x = *p << 7;
7178 while (--len >= 0)
7179 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007180 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007181 if (x == -1)
7182 x = -2;
7183 self->hash = x;
7184 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185}
7186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007187PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007190Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
7192static PyObject *
7193unicode_index(PyUnicodeObject *self, PyObject *args)
7194{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007195 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007196 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007197 Py_ssize_t start;
7198 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199
Christian Heimes9cd17752007-11-18 19:35:23 +00007200 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202
Thomas Wouters477c8d52006-05-27 19:21:47 +00007203 result = stringlib_find_slice(
7204 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7205 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7206 start, end
7207 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208
7209 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007210
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 if (result < 0) {
7212 PyErr_SetString(PyExc_ValueError, "substring not found");
7213 return NULL;
7214 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007215
Christian Heimes217cfd12007-12-02 14:31:20 +00007216 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217}
7218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007219PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007222Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007223at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224
7225static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007226unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227{
7228 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7229 register const Py_UNICODE *e;
7230 int cased;
7231
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 /* Shortcut for single character strings */
7233 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007236 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007237 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007239
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 e = p + PyUnicode_GET_SIZE(self);
7241 cased = 0;
7242 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007244
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7246 return PyBool_FromLong(0);
7247 else if (!cased && Py_UNICODE_ISLOWER(ch))
7248 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007250 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251}
7252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007253PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007256Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007257at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258
7259static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007260unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261{
7262 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7263 register const Py_UNICODE *e;
7264 int cased;
7265
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 /* Shortcut for single character strings */
7267 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007270 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007271 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007273
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 e = p + PyUnicode_GET_SIZE(self);
7275 cased = 0;
7276 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007278
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7280 return PyBool_FromLong(0);
7281 else if (!cased && Py_UNICODE_ISUPPER(ch))
7282 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007284 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285}
7286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007287PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007288 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007290Return True if S is a titlecased string and there is at least one\n\
7291character in S, i.e. upper- and titlecase characters may only\n\
7292follow uncased characters and lowercase characters only cased ones.\n\
7293Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294
7295static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007296unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297{
7298 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7299 register const Py_UNICODE *e;
7300 int cased, previous_is_cased;
7301
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 /* Shortcut for single character strings */
7303 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7305 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007307 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007308 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007310
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 e = p + PyUnicode_GET_SIZE(self);
7312 cased = 0;
7313 previous_is_cased = 0;
7314 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007316
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7318 if (previous_is_cased)
7319 return PyBool_FromLong(0);
7320 previous_is_cased = 1;
7321 cased = 1;
7322 }
7323 else if (Py_UNICODE_ISLOWER(ch)) {
7324 if (!previous_is_cased)
7325 return PyBool_FromLong(0);
7326 previous_is_cased = 1;
7327 cased = 1;
7328 }
7329 else
7330 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007332 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333}
7334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007335PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007338Return True if all characters in S are whitespace\n\
7339and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340
7341static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007342unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343{
7344 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7345 register const Py_UNICODE *e;
7346
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 /* Shortcut for single character strings */
7348 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 Py_UNICODE_ISSPACE(*p))
7350 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007352 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007353 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007354 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007355
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 e = p + PyUnicode_GET_SIZE(self);
7357 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 if (!Py_UNICODE_ISSPACE(*p))
7359 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007361 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362}
7363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007364PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007366\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007367Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007368and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007369
7370static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007371unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007372{
7373 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7374 register const Py_UNICODE *e;
7375
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007376 /* Shortcut for single character strings */
7377 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 Py_UNICODE_ISALPHA(*p))
7379 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007380
7381 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007382 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007384
7385 e = p + PyUnicode_GET_SIZE(self);
7386 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 if (!Py_UNICODE_ISALPHA(*p))
7388 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007389 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007390 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007391}
7392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007393PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007395\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007396Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007397and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007398
7399static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007400unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007401{
7402 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7403 register const Py_UNICODE *e;
7404
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007405 /* Shortcut for single character strings */
7406 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 Py_UNICODE_ISALNUM(*p))
7408 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007409
7410 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007411 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007413
7414 e = p + PyUnicode_GET_SIZE(self);
7415 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 if (!Py_UNICODE_ISALNUM(*p))
7417 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007418 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007419 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007420}
7421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007422PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007425Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007426False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427
7428static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007429unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430{
7431 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7432 register const Py_UNICODE *e;
7433
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 /* Shortcut for single character strings */
7435 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 Py_UNICODE_ISDECIMAL(*p))
7437 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007439 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007440 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007442
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 e = p + PyUnicode_GET_SIZE(self);
7444 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 if (!Py_UNICODE_ISDECIMAL(*p))
7446 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007448 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449}
7450
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007451PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007454Return True if all characters in S are digits\n\
7455and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456
7457static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007458unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459{
7460 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7461 register const Py_UNICODE *e;
7462
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 /* Shortcut for single character strings */
7464 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 Py_UNICODE_ISDIGIT(*p))
7466 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007468 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007469 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007471
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 e = p + PyUnicode_GET_SIZE(self);
7473 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 if (!Py_UNICODE_ISDIGIT(*p))
7475 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007477 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478}
7479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007480PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007483Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007484False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485
7486static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007487unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488{
7489 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7490 register const Py_UNICODE *e;
7491
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 /* Shortcut for single character strings */
7493 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 Py_UNICODE_ISNUMERIC(*p))
7495 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007497 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007498 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007500
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 e = p + PyUnicode_GET_SIZE(self);
7502 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 if (!Py_UNICODE_ISNUMERIC(*p))
7504 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007506 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507}
7508
Martin v. Löwis47383402007-08-15 07:32:56 +00007509int
7510PyUnicode_IsIdentifier(PyObject *self)
7511{
7512 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7513 register const Py_UNICODE *e;
7514
7515 /* Special case for empty strings */
7516 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007518
7519 /* PEP 3131 says that the first character must be in
7520 XID_Start and subsequent characters in XID_Continue,
7521 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007522 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007523 letters, digits, underscore). However, given the current
7524 definition of XID_Start and XID_Continue, it is sufficient
7525 to check just for these, except that _ must be allowed
7526 as starting an identifier. */
7527 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7528 return 0;
7529
7530 e = p + PyUnicode_GET_SIZE(self);
7531 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 if (!_PyUnicode_IsXidContinue(*p))
7533 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007534 }
7535 return 1;
7536}
7537
7538PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007540\n\
7541Return True if S is a valid identifier according\n\
7542to the language definition.");
7543
7544static PyObject*
7545unicode_isidentifier(PyObject *self)
7546{
7547 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7548}
7549
Georg Brandl559e5d72008-06-11 18:37:52 +00007550PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007552\n\
7553Return True if all characters in S are considered\n\
7554printable in repr() or S is empty, False otherwise.");
7555
7556static PyObject*
7557unicode_isprintable(PyObject *self)
7558{
7559 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7560 register const Py_UNICODE *e;
7561
7562 /* Shortcut for single character strings */
7563 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7564 Py_RETURN_TRUE;
7565 }
7566
7567 e = p + PyUnicode_GET_SIZE(self);
7568 for (; p < e; p++) {
7569 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7570 Py_RETURN_FALSE;
7571 }
7572 }
7573 Py_RETURN_TRUE;
7574}
7575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007576PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007577 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578\n\
7579Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007580iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581
7582static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007583unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007585 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586}
7587
Martin v. Löwis18e16552006-02-15 17:27:45 +00007588static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589unicode_length(PyUnicodeObject *self)
7590{
7591 return self->length;
7592}
7593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007594PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007597Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007598done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599
7600static PyObject *
7601unicode_ljust(PyUnicodeObject *self, PyObject *args)
7602{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007603 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007604 Py_UNICODE fillchar = ' ';
7605
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007606 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 return NULL;
7608
Tim Peters7a29bd52001-09-12 03:03:31 +00007609 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610 Py_INCREF(self);
7611 return (PyObject*) self;
7612 }
7613
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007614 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615}
7616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007617PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007620Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621
7622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007623unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625 return fixup(self, fixlower);
7626}
7627
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument-parsing format strings, one per strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string past its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7636
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007637/* externally visible for str.strip(unicode) */
7638PyObject *
7639_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7640{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007641 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7642 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7643 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7644 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7645 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007646
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007648
Benjamin Peterson14339b62009-01-31 16:36:08 +00007649 i = 0;
7650 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7652 i++;
7653 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007654 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007655
Benjamin Peterson14339b62009-01-31 16:36:08 +00007656 j = len;
7657 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007658 do {
7659 j--;
7660 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7661 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007662 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007663
Benjamin Peterson14339b62009-01-31 16:36:08 +00007664 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 Py_INCREF(self);
7666 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007667 }
7668 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007670}
7671
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672
7673static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007674do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007676 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7677 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007678
Benjamin Peterson14339b62009-01-31 16:36:08 +00007679 i = 0;
7680 if (striptype != RIGHTSTRIP) {
7681 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7682 i++;
7683 }
7684 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007685
Benjamin Peterson14339b62009-01-31 16:36:08 +00007686 j = len;
7687 if (striptype != LEFTSTRIP) {
7688 do {
7689 j--;
7690 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7691 j++;
7692 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007693
Benjamin Peterson14339b62009-01-31 16:36:08 +00007694 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7695 Py_INCREF(self);
7696 return (PyObject*)self;
7697 }
7698 else
7699 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700}
7701
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007702
7703static PyObject *
7704do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7705{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007706 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007707
Benjamin Peterson14339b62009-01-31 16:36:08 +00007708 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7709 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007710
Benjamin Peterson14339b62009-01-31 16:36:08 +00007711 if (sep != NULL && sep != Py_None) {
7712 if (PyUnicode_Check(sep))
7713 return _PyUnicode_XStrip(self, striptype, sep);
7714 else {
7715 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007716 "%s arg must be None or str",
7717 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007718 return NULL;
7719 }
7720 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007721
Benjamin Peterson14339b62009-01-31 16:36:08 +00007722 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007723}
7724
7725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007726PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007728\n\
7729Return a copy of the string S with leading and trailing\n\
7730whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007731If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007732
7733static PyObject *
7734unicode_strip(PyUnicodeObject *self, PyObject *args)
7735{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007736 if (PyTuple_GET_SIZE(args) == 0)
7737 return do_strip(self, BOTHSTRIP); /* Common case */
7738 else
7739 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007740}
7741
7742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007743PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007745\n\
7746Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007747If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007748
7749static PyObject *
7750unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7751{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007752 if (PyTuple_GET_SIZE(args) == 0)
7753 return do_strip(self, LEFTSTRIP); /* Common case */
7754 else
7755 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007756}
7757
7758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007759PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007761\n\
7762Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007763If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007764
7765static PyObject *
7766unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7767{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007768 if (PyTuple_GET_SIZE(args) == 0)
7769 return do_strip(self, RIGHTSTRIP); /* Common case */
7770 else
7771 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007772}
7773
7774
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007776unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777{
7778 PyUnicodeObject *u;
7779 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007780 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007781 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782
Georg Brandl222de0f2009-04-12 12:01:50 +00007783 if (len < 1) {
7784 Py_INCREF(unicode_empty);
7785 return (PyObject *)unicode_empty;
7786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787
Tim Peters7a29bd52001-09-12 03:03:31 +00007788 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 /* no repeat, return original string */
7790 Py_INCREF(str);
7791 return (PyObject*) str;
7792 }
Tim Peters8f422462000-09-09 06:13:41 +00007793
7794 /* ensure # of chars needed doesn't overflow int and # of bytes
7795 * needed doesn't overflow size_t
7796 */
7797 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007798 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007799 PyErr_SetString(PyExc_OverflowError,
7800 "repeated string is too long");
7801 return NULL;
7802 }
7803 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7804 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7805 PyErr_SetString(PyExc_OverflowError,
7806 "repeated string is too long");
7807 return NULL;
7808 }
7809 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 if (!u)
7811 return NULL;
7812
7813 p = u->str;
7814
Georg Brandl222de0f2009-04-12 12:01:50 +00007815 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007816 Py_UNICODE_FILL(p, str->str[0], len);
7817 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007818 Py_ssize_t done = str->length; /* number of characters copied this far */
7819 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007821 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007822 Py_UNICODE_COPY(p+done, p, n);
7823 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 }
7826
7827 return (PyObject*) u;
7828}
7829
7830PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 PyObject *subobj,
7832 PyObject *replobj,
7833 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834{
7835 PyObject *self;
7836 PyObject *str1;
7837 PyObject *str2;
7838 PyObject *result;
7839
7840 self = PyUnicode_FromObject(obj);
7841 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843 str1 = PyUnicode_FromObject(subobj);
7844 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 Py_DECREF(self);
7846 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 }
7848 str2 = PyUnicode_FromObject(replobj);
7849 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 Py_DECREF(self);
7851 Py_DECREF(str1);
7852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 }
Tim Petersced69f82003-09-16 20:30:58 +00007854 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 (PyUnicodeObject *)str1,
7856 (PyUnicodeObject *)str2,
7857 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 Py_DECREF(self);
7859 Py_DECREF(str1);
7860 Py_DECREF(str2);
7861 return result;
7862}
7863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007864PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866\n\
7867Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007868old replaced by new. If the optional argument count is\n\
7869given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870
7871static PyObject*
7872unicode_replace(PyUnicodeObject *self, PyObject *args)
7873{
7874 PyUnicodeObject *str1;
7875 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007876 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877 PyObject *result;
7878
Martin v. Löwis18e16552006-02-15 17:27:45 +00007879 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 return NULL;
7881 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7882 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007885 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 Py_DECREF(str1);
7887 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889
7890 result = replace(self, str1, str2, maxcount);
7891
7892 Py_DECREF(str1);
7893 Py_DECREF(str2);
7894 return result;
7895}
7896
7897static
7898PyObject *unicode_repr(PyObject *unicode)
7899{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007900 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007901 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007902 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7903 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7904
7905 /* XXX(nnorwitz): rather than over-allocating, it would be
7906 better to choose a different scheme. Perhaps scan the
7907 first N-chars of the string and allocate based on that size.
7908 */
7909 /* Initial allocation is based on the longest-possible unichr
7910 escape.
7911
7912 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7913 unichr, so in this case it's the longest unichr escape. In
7914 narrow (UTF-16) builds this is five chars per source unichr
7915 since there are two unichrs in the surrogate pair, so in narrow
7916 (UTF-16) builds it's not the longest unichr escape.
7917
7918 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7919 so in the narrow (UTF-16) build case it's the longest unichr
7920 escape.
7921 */
7922
Walter Dörwald1ab83302007-05-18 17:15:44 +00007923 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007924 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007925#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007927#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007929#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007931 if (repr == NULL)
7932 return NULL;
7933
Walter Dörwald1ab83302007-05-18 17:15:44 +00007934 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007935
7936 /* Add quote */
7937 *p++ = (findchar(s, size, '\'') &&
7938 !findchar(s, size, '"')) ? '"' : '\'';
7939 while (size-- > 0) {
7940 Py_UNICODE ch = *s++;
7941
7942 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007943 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007944 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007945 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007946 continue;
7947 }
7948
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007950 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007951 *p++ = '\\';
7952 *p++ = 't';
7953 }
7954 else if (ch == '\n') {
7955 *p++ = '\\';
7956 *p++ = 'n';
7957 }
7958 else if (ch == '\r') {
7959 *p++ = '\\';
7960 *p++ = 'r';
7961 }
7962
7963 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007964 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007965 *p++ = '\\';
7966 *p++ = 'x';
7967 *p++ = hexdigits[(ch >> 4) & 0x000F];
7968 *p++ = hexdigits[ch & 0x000F];
7969 }
7970
Georg Brandl559e5d72008-06-11 18:37:52 +00007971 /* Copy ASCII characters as-is */
7972 else if (ch < 0x7F) {
7973 *p++ = ch;
7974 }
7975
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007977 else {
7978 Py_UCS4 ucs = ch;
7979
7980#ifndef Py_UNICODE_WIDE
7981 Py_UNICODE ch2 = 0;
7982 /* Get code point from surrogate pair */
7983 if (size > 0) {
7984 ch2 = *s;
7985 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007987 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007989 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007990 size--;
7991 }
7992 }
7993#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007994 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007995 (categories Z* and C* except ASCII space)
7996 */
7997 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7998 /* Map 8-bit characters to '\xhh' */
7999 if (ucs <= 0xff) {
8000 *p++ = '\\';
8001 *p++ = 'x';
8002 *p++ = hexdigits[(ch >> 4) & 0x000F];
8003 *p++ = hexdigits[ch & 0x000F];
8004 }
8005 /* Map 21-bit characters to '\U00xxxxxx' */
8006 else if (ucs >= 0x10000) {
8007 *p++ = '\\';
8008 *p++ = 'U';
8009 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8010 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8011 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8012 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8013 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8014 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8015 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8016 *p++ = hexdigits[ucs & 0x0000000F];
8017 }
8018 /* Map 16-bit characters to '\uxxxx' */
8019 else {
8020 *p++ = '\\';
8021 *p++ = 'u';
8022 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8023 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8024 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8025 *p++ = hexdigits[ucs & 0x000F];
8026 }
8027 }
8028 /* Copy characters as-is */
8029 else {
8030 *p++ = ch;
8031#ifndef Py_UNICODE_WIDE
8032 if (ucs >= 0x10000)
8033 *p++ = ch2;
8034#endif
8035 }
8036 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008037 }
8038 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008039 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008040
8041 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008042 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008043 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044}
8045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008046PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048\n\
8049Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008050such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051arguments start and end are interpreted as in slice notation.\n\
8052\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008053Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054
8055static PyObject *
8056unicode_rfind(PyUnicodeObject *self, PyObject *args)
8057{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008058 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008059 Py_ssize_t start;
8060 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008061 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062
Christian Heimes9cd17752007-11-18 19:35:23 +00008063 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065
Thomas Wouters477c8d52006-05-27 19:21:47 +00008066 result = stringlib_rfind_slice(
8067 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8068 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8069 start, end
8070 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071
8072 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008073
Christian Heimes217cfd12007-12-02 14:31:20 +00008074 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075}
8076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008077PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008080Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081
8082static PyObject *
8083unicode_rindex(PyUnicodeObject *self, PyObject *args)
8084{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008085 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008086 Py_ssize_t start;
8087 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008088 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089
Christian Heimes9cd17752007-11-18 19:35:23 +00008090 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092
Thomas Wouters477c8d52006-05-27 19:21:47 +00008093 result = stringlib_rfind_slice(
8094 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8095 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8096 start, end
8097 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098
8099 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008100
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 if (result < 0) {
8102 PyErr_SetString(PyExc_ValueError, "substring not found");
8103 return NULL;
8104 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008105 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106}
8107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008108PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008111Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008112done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113
8114static PyObject *
8115unicode_rjust(PyUnicodeObject *self, PyObject *args)
8116{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008117 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008118 Py_UNICODE fillchar = ' ';
8119
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008120 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 return NULL;
8122
Tim Peters7a29bd52001-09-12 03:03:31 +00008123 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 Py_INCREF(self);
8125 return (PyObject*) self;
8126 }
8127
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008128 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129}
8130
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 PyObject *sep,
8133 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134{
8135 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008136
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 s = PyUnicode_FromObject(s);
8138 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008139 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008140 if (sep != NULL) {
8141 sep = PyUnicode_FromObject(sep);
8142 if (sep == NULL) {
8143 Py_DECREF(s);
8144 return NULL;
8145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 }
8147
8148 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8149
8150 Py_DECREF(s);
8151 Py_XDECREF(sep);
8152 return result;
8153}
8154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008155PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157\n\
8158Return a list of the words in S, using sep as the\n\
8159delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008160splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008161whitespace string is a separator and empty strings are\n\
8162removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163
8164static PyObject*
8165unicode_split(PyUnicodeObject *self, PyObject *args)
8166{
8167 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008168 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169
Martin v. Löwis18e16552006-02-15 17:27:45 +00008170 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171 return NULL;
8172
8173 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179}
8180
Thomas Wouters477c8d52006-05-27 19:21:47 +00008181PyObject *
8182PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8183{
8184 PyObject* str_obj;
8185 PyObject* sep_obj;
8186 PyObject* out;
8187
8188 str_obj = PyUnicode_FromObject(str_in);
8189 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008191 sep_obj = PyUnicode_FromObject(sep_in);
8192 if (!sep_obj) {
8193 Py_DECREF(str_obj);
8194 return NULL;
8195 }
8196
8197 out = stringlib_partition(
8198 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8199 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8200 );
8201
8202 Py_DECREF(sep_obj);
8203 Py_DECREF(str_obj);
8204
8205 return out;
8206}
8207
8208
8209PyObject *
8210PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8211{
8212 PyObject* str_obj;
8213 PyObject* sep_obj;
8214 PyObject* out;
8215
8216 str_obj = PyUnicode_FromObject(str_in);
8217 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008219 sep_obj = PyUnicode_FromObject(sep_in);
8220 if (!sep_obj) {
8221 Py_DECREF(str_obj);
8222 return NULL;
8223 }
8224
8225 out = stringlib_rpartition(
8226 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8227 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8228 );
8229
8230 Py_DECREF(sep_obj);
8231 Py_DECREF(str_obj);
8232
8233 return out;
8234}
8235
8236PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008238\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008239Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008240the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008241found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008242
8243static PyObject*
8244unicode_partition(PyUnicodeObject *self, PyObject *separator)
8245{
8246 return PyUnicode_Partition((PyObject *)self, separator);
8247}
8248
8249PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008250 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008251\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008252Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008253the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008254separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008255
8256static PyObject*
8257unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8258{
8259 return PyUnicode_RPartition((PyObject *)self, separator);
8260}
8261
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008262PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 PyObject *sep,
8264 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008265{
8266 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008267
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008268 s = PyUnicode_FromObject(s);
8269 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008270 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 if (sep != NULL) {
8272 sep = PyUnicode_FromObject(sep);
8273 if (sep == NULL) {
8274 Py_DECREF(s);
8275 return NULL;
8276 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008277 }
8278
8279 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8280
8281 Py_DECREF(s);
8282 Py_XDECREF(sep);
8283 return result;
8284}
8285
8286PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008288\n\
8289Return a list of the words in S, using sep as the\n\
8290delimiter string, starting at the end of the string and\n\
8291working to the front. If maxsplit is given, at most maxsplit\n\
8292splits are done. If sep is not specified, any whitespace string\n\
8293is a separator.");
8294
8295static PyObject*
8296unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8297{
8298 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008299 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008300
Martin v. Löwis18e16552006-02-15 17:27:45 +00008301 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008302 return NULL;
8303
8304 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008306 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008308 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008310}
8311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008312PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314\n\
8315Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008316Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008317is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318
8319static PyObject*
8320unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8321{
Guido van Rossum86662912000-04-11 15:38:46 +00008322 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323
Guido van Rossum86662912000-04-11 15:38:46 +00008324 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 return NULL;
8326
Guido van Rossum86662912000-04-11 15:38:46 +00008327 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328}
8329
8330static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008331PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332{
Walter Dörwald346737f2007-05-31 10:44:43 +00008333 if (PyUnicode_CheckExact(self)) {
8334 Py_INCREF(self);
8335 return self;
8336 } else
8337 /* Subtype -- return genuine unicode string with the same value. */
8338 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8339 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340}
8341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008342PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344\n\
8345Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008346and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347
8348static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008349unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 return fixup(self, fixswapcase);
8352}
8353
Georg Brandlceee0772007-11-27 23:48:05 +00008354PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008356\n\
8357Return a translation table usable for str.translate().\n\
8358If there is only one argument, it must be a dictionary mapping Unicode\n\
8359ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008360Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008361If there are two arguments, they must be strings of equal length, and\n\
8362in the resulting dictionary, each character in x will be mapped to the\n\
8363character at the same position in y. If there is a third argument, it\n\
8364must be a string, whose characters will be mapped to None in the result.");
8365
8366static PyObject*
8367unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8368{
8369 PyObject *x, *y = NULL, *z = NULL;
8370 PyObject *new = NULL, *key, *value;
8371 Py_ssize_t i = 0;
8372 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008373
Georg Brandlceee0772007-11-27 23:48:05 +00008374 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8375 return NULL;
8376 new = PyDict_New();
8377 if (!new)
8378 return NULL;
8379 if (y != NULL) {
8380 /* x must be a string too, of equal length */
8381 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8382 if (!PyUnicode_Check(x)) {
8383 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8384 "be a string if there is a second argument");
8385 goto err;
8386 }
8387 if (PyUnicode_GET_SIZE(x) != ylen) {
8388 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8389 "arguments must have equal length");
8390 goto err;
8391 }
8392 /* create entries for translating chars in x to those in y */
8393 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008394 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8395 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008396 if (!key || !value)
8397 goto err;
8398 res = PyDict_SetItem(new, key, value);
8399 Py_DECREF(key);
8400 Py_DECREF(value);
8401 if (res < 0)
8402 goto err;
8403 }
8404 /* create entries for deleting chars in z */
8405 if (z != NULL) {
8406 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008407 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008408 if (!key)
8409 goto err;
8410 res = PyDict_SetItem(new, key, Py_None);
8411 Py_DECREF(key);
8412 if (res < 0)
8413 goto err;
8414 }
8415 }
8416 } else {
8417 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008418 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008419 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8420 "to maketrans it must be a dict");
8421 goto err;
8422 }
8423 /* copy entries into the new dict, converting string keys to int keys */
8424 while (PyDict_Next(x, &i, &key, &value)) {
8425 if (PyUnicode_Check(key)) {
8426 /* convert string keys to integer keys */
8427 PyObject *newkey;
8428 if (PyUnicode_GET_SIZE(key) != 1) {
8429 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8430 "table must be of length 1");
8431 goto err;
8432 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008433 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008434 if (!newkey)
8435 goto err;
8436 res = PyDict_SetItem(new, newkey, value);
8437 Py_DECREF(newkey);
8438 if (res < 0)
8439 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008440 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008441 /* just keep integer keys */
8442 if (PyDict_SetItem(new, key, value) < 0)
8443 goto err;
8444 } else {
8445 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8446 "be strings or integers");
8447 goto err;
8448 }
8449 }
8450 }
8451 return new;
8452 err:
8453 Py_DECREF(new);
8454 return NULL;
8455}
8456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008457PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459\n\
8460Return a copy of the string S, where all characters have been mapped\n\
8461through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008462Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008463Unmapped characters are left untouched. Characters mapped to None\n\
8464are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465
8466static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008467unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468{
Georg Brandlceee0772007-11-27 23:48:05 +00008469 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470}
8471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008472PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008475Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476
8477static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008478unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480 return fixup(self, fixupper);
8481}
8482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008483PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008486Pad a numeric string S with zeros on the left, to fill a field\n\
8487of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488
8489static PyObject *
8490unicode_zfill(PyUnicodeObject *self, PyObject *args)
8491{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008492 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493 PyUnicodeObject *u;
8494
Martin v. Löwis18e16552006-02-15 17:27:45 +00008495 Py_ssize_t width;
8496 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497 return NULL;
8498
8499 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008500 if (PyUnicode_CheckExact(self)) {
8501 Py_INCREF(self);
8502 return (PyObject*) self;
8503 }
8504 else
8505 return PyUnicode_FromUnicode(
8506 PyUnicode_AS_UNICODE(self),
8507 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509 }
8510
8511 fill = width - self->length;
8512
8513 u = pad(self, fill, 0, '0');
8514
Walter Dörwald068325e2002-04-15 13:36:47 +00008515 if (u == NULL)
8516 return NULL;
8517
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518 if (u->str[fill] == '+' || u->str[fill] == '-') {
8519 /* move sign to beginning of string */
8520 u->str[0] = u->str[fill];
8521 u->str[fill] = '0';
8522 }
8523
8524 return (PyObject*) u;
8525}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526
#if 0
/* Compiled out: returns the current value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008535PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008538Return True if S starts with the specified prefix, False otherwise.\n\
8539With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008540With optional end, stop comparing S at that position.\n\
8541prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
8543static PyObject *
8544unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008547 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008549 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008550 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008551 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008553 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8555 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008556 if (PyTuple_Check(subobj)) {
8557 Py_ssize_t i;
8558 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8559 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008561 if (substring == NULL)
8562 return NULL;
8563 result = tailmatch(self, substring, start, end, -1);
8564 Py_DECREF(substring);
8565 if (result) {
8566 Py_RETURN_TRUE;
8567 }
8568 }
8569 /* nothing matched */
8570 Py_RETURN_FALSE;
8571 }
8572 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008575 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008577 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578}
8579
8580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008581PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008584Return True if S ends with the specified suffix, False otherwise.\n\
8585With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008586With optional end, stop comparing S at that position.\n\
8587suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
8589static PyObject *
8590unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008593 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008595 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008596 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008597 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008599 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008600 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8601 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008602 if (PyTuple_Check(subobj)) {
8603 Py_ssize_t i;
8604 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8605 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008607 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008609 result = tailmatch(self, substring, start, end, +1);
8610 Py_DECREF(substring);
8611 if (result) {
8612 Py_RETURN_TRUE;
8613 }
8614 }
8615 Py_RETURN_FALSE;
8616 }
8617 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008621 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008623 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624}
8625
Eric Smith8c663262007-08-25 02:26:07 +00008626#include "stringlib/string_format.h"
8627
8628PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008630\n\
8631");
8632
Eric Smith4a7d76d2008-05-30 18:10:19 +00008633static PyObject *
8634unicode__format__(PyObject* self, PyObject* args)
8635{
8636 PyObject *format_spec;
8637
8638 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8639 return NULL;
8640
8641 return _PyUnicode_FormatAdvanced(self,
8642 PyUnicode_AS_UNICODE(format_spec),
8643 PyUnicode_GET_SIZE(format_spec));
8644}
8645
Eric Smith8c663262007-08-25 02:26:07 +00008646PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008648\n\
8649");
8650
8651static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008652unicode__sizeof__(PyUnicodeObject *v)
8653{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008654 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8655 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008656}
8657
8658PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008660
8661static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008662unicode_getnewargs(PyUnicodeObject *v)
8663{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008664 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008665}
8666
8667
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668static PyMethodDef unicode_methods[] = {
8669
8670 /* Order is according to common usage: often used methods should
8671 appear first, since lookup is done sequentially. */
8672
Benjamin Peterson308d6372009-09-18 21:42:35 +00008673 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008674 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8675 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008676 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008677 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8678 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8679 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8680 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8681 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8682 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8683 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008684 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008685 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8686 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8687 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008688 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008689 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8690 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8691 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008692 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008693 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008694 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008695 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008696 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8697 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8698 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8699 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8700 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8701 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8702 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8703 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8704 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8705 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8706 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8707 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8708 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8709 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008710 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008711 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008712 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008713 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008714 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008715 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8716 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008717 {"maketrans", (PyCFunction) unicode_maketrans,
8718 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008719 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008720#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008721 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722#endif
8723
8724#if 0
8725 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008726 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727#endif
8728
Benjamin Peterson14339b62009-01-31 16:36:08 +00008729 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 {NULL, NULL}
8731};
8732
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008733static PyObject *
8734unicode_mod(PyObject *v, PyObject *w)
8735{
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 if (!PyUnicode_Check(v)) {
8737 Py_INCREF(Py_NotImplemented);
8738 return Py_NotImplemented;
8739 }
8740 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008741}
8742
8743static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008744 0, /*nb_add*/
8745 0, /*nb_subtract*/
8746 0, /*nb_multiply*/
8747 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008748};
8749
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008751 (lenfunc) unicode_length, /* sq_length */
8752 PyUnicode_Concat, /* sq_concat */
8753 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8754 (ssizeargfunc) unicode_getitem, /* sq_item */
8755 0, /* sq_slice */
8756 0, /* sq_ass_item */
8757 0, /* sq_ass_slice */
8758 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759};
8760
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008761static PyObject*
8762unicode_subscript(PyUnicodeObject* self, PyObject* item)
8763{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008764 if (PyIndex_Check(item)) {
8765 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008766 if (i == -1 && PyErr_Occurred())
8767 return NULL;
8768 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008769 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008770 return unicode_getitem(self, i);
8771 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008772 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008773 Py_UNICODE* source_buf;
8774 Py_UNICODE* result_buf;
8775 PyObject* result;
8776
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008777 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008779 return NULL;
8780 }
8781
8782 if (slicelength <= 0) {
8783 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008784 } else if (start == 0 && step == 1 && slicelength == self->length &&
8785 PyUnicode_CheckExact(self)) {
8786 Py_INCREF(self);
8787 return (PyObject *)self;
8788 } else if (step == 1) {
8789 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008790 } else {
8791 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008792 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8793 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008794
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 if (result_buf == NULL)
8796 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008797
8798 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8799 result_buf[i] = source_buf[cur];
8800 }
Tim Petersced69f82003-09-16 20:30:58 +00008801
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008802 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008803 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008804 return result;
8805 }
8806 } else {
8807 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8808 return NULL;
8809 }
8810}
8811
8812static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008813 (lenfunc)unicode_length, /* mp_length */
8814 (binaryfunc)unicode_subscript, /* mp_subscript */
8815 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008816};
8817
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819/* Helpers for PyUnicode_Format() */
8820
8821static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008822getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008824 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 (*p_argidx)++;
8827 if (arglen < 0)
8828 return args;
8829 else
8830 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 }
8832 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834 return NULL;
8835}
8836
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008837/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008839static PyObject *
8840formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008842 char *p;
8843 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008845
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 x = PyFloat_AsDouble(v);
8847 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008848 return NULL;
8849
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008852
Eric Smith0923d1d2009-04-16 20:16:10 +00008853 p = PyOS_double_to_string(x, type, prec,
8854 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008855 if (p == NULL)
8856 return NULL;
8857 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008858 PyMem_Free(p);
8859 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860}
8861
Tim Peters38fd5b62000-09-21 05:43:11 +00008862static PyObject*
8863formatlong(PyObject *val, int flags, int prec, int type)
8864{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008865 char *buf;
8866 int len;
8867 PyObject *str; /* temporary string object. */
8868 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008869
Benjamin Peterson14339b62009-01-31 16:36:08 +00008870 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8871 if (!str)
8872 return NULL;
8873 result = PyUnicode_FromStringAndSize(buf, len);
8874 Py_DECREF(str);
8875 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008876}
8877
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878static int
8879formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008880 size_t buflen,
8881 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008883 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008884 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 if (PyUnicode_GET_SIZE(v) == 1) {
8886 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8887 buf[1] = '\0';
8888 return 1;
8889 }
8890#ifndef Py_UNICODE_WIDE
8891 if (PyUnicode_GET_SIZE(v) == 2) {
8892 /* Decode a valid surrogate pair */
8893 int c0 = PyUnicode_AS_UNICODE(v)[0];
8894 int c1 = PyUnicode_AS_UNICODE(v)[1];
8895 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8896 0xDC00 <= c1 && c1 <= 0xDFFF) {
8897 buf[0] = c0;
8898 buf[1] = c1;
8899 buf[2] = '\0';
8900 return 2;
8901 }
8902 }
8903#endif
8904 goto onError;
8905 }
8906 else {
8907 /* Integer input truncated to a character */
8908 long x;
8909 x = PyLong_AsLong(v);
8910 if (x == -1 && PyErr_Occurred())
8911 goto onError;
8912
8913 if (x < 0 || x > 0x10ffff) {
8914 PyErr_SetString(PyExc_OverflowError,
8915 "%c arg not in range(0x110000)");
8916 return -1;
8917 }
8918
8919#ifndef Py_UNICODE_WIDE
8920 if (x > 0xffff) {
8921 x -= 0x10000;
8922 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8923 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8924 return 2;
8925 }
8926#endif
8927 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008928 buf[1] = '\0';
8929 return 1;
8930 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008931
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008933 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008935 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936}
8937
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that it behaves as a single
   operand in any expression (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008942
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945{
8946 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008947 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948 int args_owned = 0;
8949 PyUnicodeObject *result = NULL;
8950 PyObject *dict = NULL;
8951 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008952
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 PyErr_BadInternalCall();
8955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956 }
8957 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008958 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960 fmt = PyUnicode_AS_UNICODE(uformat);
8961 fmtcnt = PyUnicode_GET_SIZE(uformat);
8962
8963 reslen = rescnt = fmtcnt + 100;
8964 result = _PyUnicode_New(reslen);
8965 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008966 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 res = PyUnicode_AS_UNICODE(result);
8968
8969 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008970 arglen = PyTuple_Size(args);
8971 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 }
8973 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 arglen = -1;
8975 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008977 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008978 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980
8981 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 if (*fmt != '%') {
8983 if (--rescnt < 0) {
8984 rescnt = fmtcnt + 100;
8985 reslen += rescnt;
8986 if (_PyUnicode_Resize(&result, reslen) < 0)
8987 goto onError;
8988 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8989 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008990 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008992 }
8993 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 /* Got a format specifier */
8995 int flags = 0;
8996 Py_ssize_t width = -1;
8997 int prec = -1;
8998 Py_UNICODE c = '\0';
8999 Py_UNICODE fill;
9000 int isnumok;
9001 PyObject *v = NULL;
9002 PyObject *temp = NULL;
9003 Py_UNICODE *pbuf;
9004 Py_UNICODE sign;
9005 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009006 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 fmt++;
9009 if (*fmt == '(') {
9010 Py_UNICODE *keystart;
9011 Py_ssize_t keylen;
9012 PyObject *key;
9013 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009014
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 if (dict == NULL) {
9016 PyErr_SetString(PyExc_TypeError,
9017 "format requires a mapping");
9018 goto onError;
9019 }
9020 ++fmt;
9021 --fmtcnt;
9022 keystart = fmt;
9023 /* Skip over balanced parentheses */
9024 while (pcount > 0 && --fmtcnt >= 0) {
9025 if (*fmt == ')')
9026 --pcount;
9027 else if (*fmt == '(')
9028 ++pcount;
9029 fmt++;
9030 }
9031 keylen = fmt - keystart - 1;
9032 if (fmtcnt < 0 || pcount > 0) {
9033 PyErr_SetString(PyExc_ValueError,
9034 "incomplete format key");
9035 goto onError;
9036 }
9037#if 0
9038 /* keys are converted to strings using UTF-8 and
9039 then looked up since Python uses strings to hold
9040 variables names etc. in its namespaces and we
9041 wouldn't want to break common idioms. */
9042 key = PyUnicode_EncodeUTF8(keystart,
9043 keylen,
9044 NULL);
9045#else
9046 key = PyUnicode_FromUnicode(keystart, keylen);
9047#endif
9048 if (key == NULL)
9049 goto onError;
9050 if (args_owned) {
9051 Py_DECREF(args);
9052 args_owned = 0;
9053 }
9054 args = PyObject_GetItem(dict, key);
9055 Py_DECREF(key);
9056 if (args == NULL) {
9057 goto onError;
9058 }
9059 args_owned = 1;
9060 arglen = -1;
9061 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009062 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009063 while (--fmtcnt >= 0) {
9064 switch (c = *fmt++) {
9065 case '-': flags |= F_LJUST; continue;
9066 case '+': flags |= F_SIGN; continue;
9067 case ' ': flags |= F_BLANK; continue;
9068 case '#': flags |= F_ALT; continue;
9069 case '0': flags |= F_ZERO; continue;
9070 }
9071 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009072 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009073 if (c == '*') {
9074 v = getnextarg(args, arglen, &argidx);
9075 if (v == NULL)
9076 goto onError;
9077 if (!PyLong_Check(v)) {
9078 PyErr_SetString(PyExc_TypeError,
9079 "* wants int");
9080 goto onError;
9081 }
9082 width = PyLong_AsLong(v);
9083 if (width == -1 && PyErr_Occurred())
9084 goto onError;
9085 if (width < 0) {
9086 flags |= F_LJUST;
9087 width = -width;
9088 }
9089 if (--fmtcnt >= 0)
9090 c = *fmt++;
9091 }
9092 else if (c >= '0' && c <= '9') {
9093 width = c - '0';
9094 while (--fmtcnt >= 0) {
9095 c = *fmt++;
9096 if (c < '0' || c > '9')
9097 break;
9098 if ((width*10) / 10 != width) {
9099 PyErr_SetString(PyExc_ValueError,
9100 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009101 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 }
9103 width = width*10 + (c - '0');
9104 }
9105 }
9106 if (c == '.') {
9107 prec = 0;
9108 if (--fmtcnt >= 0)
9109 c = *fmt++;
9110 if (c == '*') {
9111 v = getnextarg(args, arglen, &argidx);
9112 if (v == NULL)
9113 goto onError;
9114 if (!PyLong_Check(v)) {
9115 PyErr_SetString(PyExc_TypeError,
9116 "* wants int");
9117 goto onError;
9118 }
9119 prec = PyLong_AsLong(v);
9120 if (prec == -1 && PyErr_Occurred())
9121 goto onError;
9122 if (prec < 0)
9123 prec = 0;
9124 if (--fmtcnt >= 0)
9125 c = *fmt++;
9126 }
9127 else if (c >= '0' && c <= '9') {
9128 prec = c - '0';
9129 while (--fmtcnt >= 0) {
9130 c = Py_CHARMASK(*fmt++);
9131 if (c < '0' || c > '9')
9132 break;
9133 if ((prec*10) / 10 != prec) {
9134 PyErr_SetString(PyExc_ValueError,
9135 "prec too big");
9136 goto onError;
9137 }
9138 prec = prec*10 + (c - '0');
9139 }
9140 }
9141 } /* prec */
9142 if (fmtcnt >= 0) {
9143 if (c == 'h' || c == 'l' || c == 'L') {
9144 if (--fmtcnt >= 0)
9145 c = *fmt++;
9146 }
9147 }
9148 if (fmtcnt < 0) {
9149 PyErr_SetString(PyExc_ValueError,
9150 "incomplete format");
9151 goto onError;
9152 }
9153 if (c != '%') {
9154 v = getnextarg(args, arglen, &argidx);
9155 if (v == NULL)
9156 goto onError;
9157 }
9158 sign = 0;
9159 fill = ' ';
9160 switch (c) {
9161
9162 case '%':
9163 pbuf = formatbuf;
9164 /* presume that buffer length is at least 1 */
9165 pbuf[0] = '%';
9166 len = 1;
9167 break;
9168
9169 case 's':
9170 case 'r':
9171 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009172 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009173 temp = v;
9174 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009175 }
9176 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 if (c == 's')
9178 temp = PyObject_Str(v);
9179 else if (c == 'r')
9180 temp = PyObject_Repr(v);
9181 else
9182 temp = PyObject_ASCII(v);
9183 if (temp == NULL)
9184 goto onError;
9185 if (PyUnicode_Check(temp))
9186 /* nothing to do */;
9187 else {
9188 Py_DECREF(temp);
9189 PyErr_SetString(PyExc_TypeError,
9190 "%s argument has non-string str()");
9191 goto onError;
9192 }
9193 }
9194 pbuf = PyUnicode_AS_UNICODE(temp);
9195 len = PyUnicode_GET_SIZE(temp);
9196 if (prec >= 0 && len > prec)
9197 len = prec;
9198 break;
9199
9200 case 'i':
9201 case 'd':
9202 case 'u':
9203 case 'o':
9204 case 'x':
9205 case 'X':
9206 if (c == 'i')
9207 c = 'd';
9208 isnumok = 0;
9209 if (PyNumber_Check(v)) {
9210 PyObject *iobj=NULL;
9211
9212 if (PyLong_Check(v)) {
9213 iobj = v;
9214 Py_INCREF(iobj);
9215 }
9216 else {
9217 iobj = PyNumber_Long(v);
9218 }
9219 if (iobj!=NULL) {
9220 if (PyLong_Check(iobj)) {
9221 isnumok = 1;
9222 temp = formatlong(iobj, flags, prec, c);
9223 Py_DECREF(iobj);
9224 if (!temp)
9225 goto onError;
9226 pbuf = PyUnicode_AS_UNICODE(temp);
9227 len = PyUnicode_GET_SIZE(temp);
9228 sign = 1;
9229 }
9230 else {
9231 Py_DECREF(iobj);
9232 }
9233 }
9234 }
9235 if (!isnumok) {
9236 PyErr_Format(PyExc_TypeError,
9237 "%%%c format: a number is required, "
9238 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9239 goto onError;
9240 }
9241 if (flags & F_ZERO)
9242 fill = '0';
9243 break;
9244
9245 case 'e':
9246 case 'E':
9247 case 'f':
9248 case 'F':
9249 case 'g':
9250 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009251 temp = formatfloat(v, flags, prec, c);
9252 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009253 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009254 pbuf = PyUnicode_AS_UNICODE(temp);
9255 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009256 sign = 1;
9257 if (flags & F_ZERO)
9258 fill = '0';
9259 break;
9260
9261 case 'c':
9262 pbuf = formatbuf;
9263 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9264 if (len < 0)
9265 goto onError;
9266 break;
9267
9268 default:
9269 PyErr_Format(PyExc_ValueError,
9270 "unsupported format character '%c' (0x%x) "
9271 "at index %zd",
9272 (31<=c && c<=126) ? (char)c : '?',
9273 (int)c,
9274 (Py_ssize_t)(fmt - 1 -
9275 PyUnicode_AS_UNICODE(uformat)));
9276 goto onError;
9277 }
9278 if (sign) {
9279 if (*pbuf == '-' || *pbuf == '+') {
9280 sign = *pbuf++;
9281 len--;
9282 }
9283 else if (flags & F_SIGN)
9284 sign = '+';
9285 else if (flags & F_BLANK)
9286 sign = ' ';
9287 else
9288 sign = 0;
9289 }
9290 if (width < len)
9291 width = len;
9292 if (rescnt - (sign != 0) < width) {
9293 reslen -= rescnt;
9294 rescnt = width + fmtcnt + 100;
9295 reslen += rescnt;
9296 if (reslen < 0) {
9297 Py_XDECREF(temp);
9298 PyErr_NoMemory();
9299 goto onError;
9300 }
9301 if (_PyUnicode_Resize(&result, reslen) < 0) {
9302 Py_XDECREF(temp);
9303 goto onError;
9304 }
9305 res = PyUnicode_AS_UNICODE(result)
9306 + reslen - rescnt;
9307 }
9308 if (sign) {
9309 if (fill != ' ')
9310 *res++ = sign;
9311 rescnt--;
9312 if (width > len)
9313 width--;
9314 }
9315 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9316 assert(pbuf[0] == '0');
9317 assert(pbuf[1] == c);
9318 if (fill != ' ') {
9319 *res++ = *pbuf++;
9320 *res++ = *pbuf++;
9321 }
9322 rescnt -= 2;
9323 width -= 2;
9324 if (width < 0)
9325 width = 0;
9326 len -= 2;
9327 }
9328 if (width > len && !(flags & F_LJUST)) {
9329 do {
9330 --rescnt;
9331 *res++ = fill;
9332 } while (--width > len);
9333 }
9334 if (fill == ' ') {
9335 if (sign)
9336 *res++ = sign;
9337 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9338 assert(pbuf[0] == '0');
9339 assert(pbuf[1] == c);
9340 *res++ = *pbuf++;
9341 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009342 }
9343 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009344 Py_UNICODE_COPY(res, pbuf, len);
9345 res += len;
9346 rescnt -= len;
9347 while (--width >= len) {
9348 --rescnt;
9349 *res++ = ' ';
9350 }
9351 if (dict && (argidx < arglen) && c != '%') {
9352 PyErr_SetString(PyExc_TypeError,
9353 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009354 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009355 goto onError;
9356 }
9357 Py_XDECREF(temp);
9358 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 } /* until end */
9360 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 PyErr_SetString(PyExc_TypeError,
9362 "not all arguments converted during string formatting");
9363 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364 }
9365
Thomas Woutersa96affe2006-03-12 00:29:36 +00009366 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 }
9371 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372 return (PyObject *)result;
9373
Benjamin Peterson29060642009-01-31 22:14:21 +00009374 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 Py_XDECREF(result);
9376 Py_DECREF(uformat);
9377 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009378 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379 }
9380 return NULL;
9381}
9382
Jeremy Hylton938ace62002-07-17 16:30:39 +00009383static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009384unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9385
Tim Peters6d6c1a32001-08-02 04:15:00 +00009386static PyObject *
9387unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9388{
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009390 static char *kwlist[] = {"object", "encoding", "errors", 0};
9391 char *encoding = NULL;
9392 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009393
Benjamin Peterson14339b62009-01-31 16:36:08 +00009394 if (type != &PyUnicode_Type)
9395 return unicode_subtype_new(type, args, kwds);
9396 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009398 return NULL;
9399 if (x == NULL)
9400 return (PyObject *)_PyUnicode_New(0);
9401 if (encoding == NULL && errors == NULL)
9402 return PyObject_Str(x);
9403 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009404 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009405}
9406
Guido van Rossume023fe02001-08-30 03:12:59 +00009407static PyObject *
9408unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9409{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009410 PyUnicodeObject *tmp, *pnew;
9411 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009412
Benjamin Peterson14339b62009-01-31 16:36:08 +00009413 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9414 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9415 if (tmp == NULL)
9416 return NULL;
9417 assert(PyUnicode_Check(tmp));
9418 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9419 if (pnew == NULL) {
9420 Py_DECREF(tmp);
9421 return NULL;
9422 }
9423 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9424 if (pnew->str == NULL) {
9425 _Py_ForgetReference((PyObject *)pnew);
9426 PyObject_Del(pnew);
9427 Py_DECREF(tmp);
9428 return PyErr_NoMemory();
9429 }
9430 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9431 pnew->length = n;
9432 pnew->hash = tmp->hash;
9433 Py_DECREF(tmp);
9434 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009435}
9436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009437PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009438 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009439\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009440Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009441encoding defaults to the current default string encoding.\n\
9442errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009443
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009444static PyObject *unicode_iter(PyObject *seq);
9445
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009447 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009448 "str", /* tp_name */
9449 sizeof(PyUnicodeObject), /* tp_size */
9450 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009452 (destructor)unicode_dealloc, /* tp_dealloc */
9453 0, /* tp_print */
9454 0, /* tp_getattr */
9455 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009456 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009457 unicode_repr, /* tp_repr */
9458 &unicode_as_number, /* tp_as_number */
9459 &unicode_as_sequence, /* tp_as_sequence */
9460 &unicode_as_mapping, /* tp_as_mapping */
9461 (hashfunc) unicode_hash, /* tp_hash*/
9462 0, /* tp_call*/
9463 (reprfunc) unicode_str, /* tp_str */
9464 PyObject_GenericGetAttr, /* tp_getattro */
9465 0, /* tp_setattro */
9466 0, /* tp_as_buffer */
9467 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009469 unicode_doc, /* tp_doc */
9470 0, /* tp_traverse */
9471 0, /* tp_clear */
9472 PyUnicode_RichCompare, /* tp_richcompare */
9473 0, /* tp_weaklistoffset */
9474 unicode_iter, /* tp_iter */
9475 0, /* tp_iternext */
9476 unicode_methods, /* tp_methods */
9477 0, /* tp_members */
9478 0, /* tp_getset */
9479 &PyBaseObject_Type, /* tp_base */
9480 0, /* tp_dict */
9481 0, /* tp_descr_get */
9482 0, /* tp_descr_set */
9483 0, /* tp_dictoffset */
9484 0, /* tp_init */
9485 0, /* tp_alloc */
9486 unicode_new, /* tp_new */
9487 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488};
9489
9490/* Initialize the Unicode implementation */
9491
Thomas Wouters78890102000-07-22 19:25:51 +00009492void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009494 int i;
9495
Thomas Wouters477c8d52006-05-27 19:21:47 +00009496 /* XXX - move this array to unicodectype.c ? */
9497 Py_UNICODE linebreak[] = {
9498 0x000A, /* LINE FEED */
9499 0x000D, /* CARRIAGE RETURN */
9500 0x001C, /* FILE SEPARATOR */
9501 0x001D, /* GROUP SEPARATOR */
9502 0x001E, /* RECORD SEPARATOR */
9503 0x0085, /* NEXT LINE */
9504 0x2028, /* LINE SEPARATOR */
9505 0x2029, /* PARAGRAPH SEPARATOR */
9506 };
9507
Fred Drakee4315f52000-05-09 19:53:39 +00009508 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009509 free_list = NULL;
9510 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009512 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009514
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009515 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009517 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009518 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009519
9520 /* initialize the linebreak bloom filter */
9521 bloom_linebreak = make_bloom_mask(
9522 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9523 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009524
9525 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526}
9527
9528/* Finalize the Unicode implementation */
9529
Christian Heimesa156e092008-02-16 07:38:31 +00009530int
9531PyUnicode_ClearFreeList(void)
9532{
9533 int freelist_size = numfree;
9534 PyUnicodeObject *u;
9535
9536 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 PyUnicodeObject *v = u;
9538 u = *(PyUnicodeObject **)u;
9539 if (v->str)
9540 PyObject_DEL(v->str);
9541 Py_XDECREF(v->defenc);
9542 PyObject_Del(v);
9543 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009544 }
9545 free_list = NULL;
9546 assert(numfree == 0);
9547 return freelist_size;
9548}
9549
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550void
Thomas Wouters78890102000-07-22 19:25:51 +00009551_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009553 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009555 Py_XDECREF(unicode_empty);
9556 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009557
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009558 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009559 if (unicode_latin1[i]) {
9560 Py_DECREF(unicode_latin1[i]);
9561 unicode_latin1[i] = NULL;
9562 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009563 }
Christian Heimesa156e092008-02-16 07:38:31 +00009564 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009566
Walter Dörwald16807132007-05-25 13:52:07 +00009567void
9568PyUnicode_InternInPlace(PyObject **p)
9569{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009570 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9571 PyObject *t;
9572 if (s == NULL || !PyUnicode_Check(s))
9573 Py_FatalError(
9574 "PyUnicode_InternInPlace: unicode strings only please!");
9575 /* If it's a subclass, we don't really know what putting
9576 it in the interned dict might do. */
9577 if (!PyUnicode_CheckExact(s))
9578 return;
9579 if (PyUnicode_CHECK_INTERNED(s))
9580 return;
9581 if (interned == NULL) {
9582 interned = PyDict_New();
9583 if (interned == NULL) {
9584 PyErr_Clear(); /* Don't leave an exception */
9585 return;
9586 }
9587 }
9588 /* It might be that the GetItem call fails even
9589 though the key is present in the dictionary,
9590 namely when this happens during a stack overflow. */
9591 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009592 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009593 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009594
Benjamin Peterson29060642009-01-31 22:14:21 +00009595 if (t) {
9596 Py_INCREF(t);
9597 Py_DECREF(*p);
9598 *p = t;
9599 return;
9600 }
Walter Dörwald16807132007-05-25 13:52:07 +00009601
Benjamin Peterson14339b62009-01-31 16:36:08 +00009602 PyThreadState_GET()->recursion_critical = 1;
9603 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9604 PyErr_Clear();
9605 PyThreadState_GET()->recursion_critical = 0;
9606 return;
9607 }
9608 PyThreadState_GET()->recursion_critical = 0;
9609 /* The two references in interned are not counted by refcnt.
9610 The deallocator will take care of this */
9611 Py_REFCNT(s) -= 2;
9612 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009613}
9614
9615void
9616PyUnicode_InternImmortal(PyObject **p)
9617{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009618 PyUnicode_InternInPlace(p);
9619 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9620 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9621 Py_INCREF(*p);
9622 }
Walter Dörwald16807132007-05-25 13:52:07 +00009623}
9624
9625PyObject *
9626PyUnicode_InternFromString(const char *cp)
9627{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009628 PyObject *s = PyUnicode_FromString(cp);
9629 if (s == NULL)
9630 return NULL;
9631 PyUnicode_InternInPlace(&s);
9632 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009633}
9634
9635void _Py_ReleaseInternedUnicodeStrings(void)
9636{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009637 PyObject *keys;
9638 PyUnicodeObject *s;
9639 Py_ssize_t i, n;
9640 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009641
Benjamin Peterson14339b62009-01-31 16:36:08 +00009642 if (interned == NULL || !PyDict_Check(interned))
9643 return;
9644 keys = PyDict_Keys(interned);
9645 if (keys == NULL || !PyList_Check(keys)) {
9646 PyErr_Clear();
9647 return;
9648 }
Walter Dörwald16807132007-05-25 13:52:07 +00009649
Benjamin Peterson14339b62009-01-31 16:36:08 +00009650 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9651 detector, interned unicode strings are not forcibly deallocated;
9652 rather, we give them their stolen references back, and then clear
9653 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009654
Benjamin Peterson14339b62009-01-31 16:36:08 +00009655 n = PyList_GET_SIZE(keys);
9656 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009657 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009658 for (i = 0; i < n; i++) {
9659 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9660 switch (s->state) {
9661 case SSTATE_NOT_INTERNED:
9662 /* XXX Shouldn't happen */
9663 break;
9664 case SSTATE_INTERNED_IMMORTAL:
9665 Py_REFCNT(s) += 1;
9666 immortal_size += s->length;
9667 break;
9668 case SSTATE_INTERNED_MORTAL:
9669 Py_REFCNT(s) += 2;
9670 mortal_size += s->length;
9671 break;
9672 default:
9673 Py_FatalError("Inconsistent interned string state.");
9674 }
9675 s->state = SSTATE_NOT_INTERNED;
9676 }
9677 fprintf(stderr, "total size of all interned strings: "
9678 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9679 "mortal/immortal\n", mortal_size, immortal_size);
9680 Py_DECREF(keys);
9681 PyDict_Clear(interned);
9682 Py_DECREF(interned);
9683 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009684}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009685
9686
9687/********************* Unicode Iterator **************************/
9688
9689typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009690 PyObject_HEAD
9691 Py_ssize_t it_index;
9692 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009693} unicodeiterobject;
9694
9695static void
9696unicodeiter_dealloc(unicodeiterobject *it)
9697{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009698 _PyObject_GC_UNTRACK(it);
9699 Py_XDECREF(it->it_seq);
9700 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009701}
9702
9703static int
9704unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9705{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009706 Py_VISIT(it->it_seq);
9707 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009708}
9709
9710static PyObject *
9711unicodeiter_next(unicodeiterobject *it)
9712{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009713 PyUnicodeObject *seq;
9714 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009715
Benjamin Peterson14339b62009-01-31 16:36:08 +00009716 assert(it != NULL);
9717 seq = it->it_seq;
9718 if (seq == NULL)
9719 return NULL;
9720 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009721
Benjamin Peterson14339b62009-01-31 16:36:08 +00009722 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9723 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009724 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009725 if (item != NULL)
9726 ++it->it_index;
9727 return item;
9728 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009729
Benjamin Peterson14339b62009-01-31 16:36:08 +00009730 Py_DECREF(seq);
9731 it->it_seq = NULL;
9732 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009733}
9734
9735static PyObject *
9736unicodeiter_len(unicodeiterobject *it)
9737{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009738 Py_ssize_t len = 0;
9739 if (it->it_seq)
9740 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9741 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009742}
9743
9744PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9745
9746static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009747 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009748 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009749 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009750};
9751
9752PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009753 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9754 "str_iterator", /* tp_name */
9755 sizeof(unicodeiterobject), /* tp_basicsize */
9756 0, /* tp_itemsize */
9757 /* methods */
9758 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9759 0, /* tp_print */
9760 0, /* tp_getattr */
9761 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009762 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009763 0, /* tp_repr */
9764 0, /* tp_as_number */
9765 0, /* tp_as_sequence */
9766 0, /* tp_as_mapping */
9767 0, /* tp_hash */
9768 0, /* tp_call */
9769 0, /* tp_str */
9770 PyObject_GenericGetAttr, /* tp_getattro */
9771 0, /* tp_setattro */
9772 0, /* tp_as_buffer */
9773 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9774 0, /* tp_doc */
9775 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9776 0, /* tp_clear */
9777 0, /* tp_richcompare */
9778 0, /* tp_weaklistoffset */
9779 PyObject_SelfIter, /* tp_iter */
9780 (iternextfunc)unicodeiter_next, /* tp_iternext */
9781 unicodeiter_methods, /* tp_methods */
9782 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009783};
9784
9785static PyObject *
9786unicode_iter(PyObject *seq)
9787{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009788 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009789
Benjamin Peterson14339b62009-01-31 16:36:08 +00009790 if (!PyUnicode_Check(seq)) {
9791 PyErr_BadInternalCall();
9792 return NULL;
9793 }
9794 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9795 if (it == NULL)
9796 return NULL;
9797 it->it_index = 0;
9798 Py_INCREF(seq);
9799 it->it_seq = (PyUnicodeObject *)seq;
9800 _PyObject_GC_TRACK(it);
9801 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009802}
9803
Martin v. Löwis5b222132007-06-10 09:51:05 +00009804size_t
9805Py_UNICODE_strlen(const Py_UNICODE *u)
9806{
9807 int res = 0;
9808 while(*u++)
9809 res++;
9810 return res;
9811}
9812
9813Py_UNICODE*
9814Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9815{
9816 Py_UNICODE *u = s1;
9817 while ((*u++ = *s2++));
9818 return s1;
9819}
9820
9821Py_UNICODE*
9822Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9823{
9824 Py_UNICODE *u = s1;
9825 while ((*u++ = *s2++))
9826 if (n-- == 0)
9827 break;
9828 return s1;
9829}
9830
9831int
9832Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9833{
9834 while (*s1 && *s2 && *s1 == *s2)
9835 s1++, s2++;
9836 if (*s1 && *s2)
9837 return (*s1 < *s2) ? -1 : +1;
9838 if (*s1)
9839 return 1;
9840 if (*s2)
9841 return -1;
9842 return 0;
9843}
9844
9845Py_UNICODE*
9846Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9847{
9848 const Py_UNICODE *p;
9849 for (p = s; *p; p++)
9850 if (*p == c)
9851 return (Py_UNICODE*)p;
9852 return NULL;
9853}
9854
9855
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009856#ifdef __cplusplus
9857}
9858#endif