blob: 8d75b205de73d16e6720b433432f432b8f8f49ae [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
/* Limit for the Unicode object free list keep-alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Lookup table for fast detection of the most frequent (ASCII) whitespace
   characters: entry i is 1 when code point i is whitespace, 0 otherwise.
   The table covers code points 0x00..0x7F only. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Lookup table for fast detection of ASCII line break characters:
   entry i is 1 when code point i is a line break, 0 otherwise.
   The table covers code points 0x00..0x7F only and is never written to,
   hence it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant bits of each Unicode character (as many as are needed to
   index BLOOM_WIDTH bits) as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
232#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Nonzero if chr is a member of the character set (set, setlen).  The
   bloom mask is tested first so the linear unicode_member() scan only runs
   when the mask bit for chr is set.  The whole expansion is parenthesized
   so the macro behaves as a single operand (e.g. under `!`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
318/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000319 Ux0000 terminated; some code (e.g. new_identifier)
320 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
/* Append the NUL-terminated char string `string` at the output cursor.
   The macro relies on two variables of the expanding function: `copy`
   (a scratch const char *) and `s` (the output cursor, advanced by one
   element per character copied). */
#define appendstring(string) \
    {for (copy = string; *copy; ) *s++ = *copy++;}

/* Size of the fixed-size buffer used for formatting single arguments. */
#define ITEM_BUFFER_LEN 21

/* Maximum number of characters required for the output of %ld.
   21 characters allows for 64-bit integers (in decimal) and an optional
   sign. */
#define MAX_LONG_CHARS 21

/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for the
   sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 "coercing to str: need string or buffer, "
1277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   are converted to lower case and '_' becomes '-', so that e.g. UTF_8 is
   recognised.  `lower_len` is the size of the `lower` buffer including
   room for the terminating NUL.
   Return 1 on success, 0 if the encoding name does not fit (it is longer
   than lower_len-1); in that case `lower` is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the NUL terminator */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        if (dst == limit)
            return 0;

        if (ISUPPER(*src))
            *dst = TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';

    return 1;
}
1328
1329PyObject *PyUnicode_Decode(const char *s,
1330 Py_ssize_t size,
1331 const char *encoding,
1332 const char *errors)
1333{
1334 PyObject *buffer = NULL, *unicode;
1335 Py_buffer info;
1336 char lower[11]; /* Enough for any encoding shortcut */
1337
1338 if (encoding == NULL)
1339 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001340
1341 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001342 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1343 if (strcmp(lower, "utf-8") == 0)
1344 return PyUnicode_DecodeUTF8(s, size, errors);
1345 else if ((strcmp(lower, "latin-1") == 0) ||
1346 (strcmp(lower, "iso-8859-1") == 0))
1347 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001348#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001349 else if (strcmp(lower, "mbcs") == 0)
1350 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001351#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001352 else if (strcmp(lower, "ascii") == 0)
1353 return PyUnicode_DecodeASCII(s, size, errors);
1354 else if (strcmp(lower, "utf-16") == 0)
1355 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1356 else if (strcmp(lower, "utf-32") == 0)
1357 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359
1360 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001361 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001362 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001363 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001364 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 if (buffer == NULL)
1366 goto onError;
1367 unicode = PyCodec_Decode(buffer, encoding, errors);
1368 if (unicode == NULL)
1369 goto onError;
1370 if (!PyUnicode_Check(unicode)) {
1371 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001372 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001373 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001374 Py_DECREF(unicode);
1375 goto onError;
1376 }
1377 Py_DECREF(buffer);
1378 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001379
Benjamin Peterson29060642009-01-31 22:14:21 +00001380 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 Py_XDECREF(buffer);
1382 return NULL;
1383}
1384
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001385PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1386 const char *encoding,
1387 const char *errors)
1388{
1389 PyObject *v;
1390
1391 if (!PyUnicode_Check(unicode)) {
1392 PyErr_BadArgument();
1393 goto onError;
1394 }
1395
1396 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001398
1399 /* Decode via the codec registry */
1400 v = PyCodec_Decode(unicode, encoding, errors);
1401 if (v == NULL)
1402 goto onError;
1403 return v;
1404
Benjamin Peterson29060642009-01-31 22:14:21 +00001405 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001406 return NULL;
1407}
1408
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001409PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1410 const char *encoding,
1411 const char *errors)
1412{
1413 PyObject *v;
1414
1415 if (!PyUnicode_Check(unicode)) {
1416 PyErr_BadArgument();
1417 goto onError;
1418 }
1419
1420 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001422
1423 /* Decode via the codec registry */
1424 v = PyCodec_Decode(unicode, encoding, errors);
1425 if (v == NULL)
1426 goto onError;
1427 if (!PyUnicode_Check(v)) {
1428 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001429 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001430 Py_TYPE(v)->tp_name);
1431 Py_DECREF(v);
1432 goto onError;
1433 }
1434 return v;
1435
Benjamin Peterson29060642009-01-31 22:14:21 +00001436 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001437 return NULL;
1438}
1439
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001441 Py_ssize_t size,
1442 const char *encoding,
1443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444{
1445 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001446
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 unicode = PyUnicode_FromUnicode(s, size);
1448 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1451 Py_DECREF(unicode);
1452 return v;
1453}
1454
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001455PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1456 const char *encoding,
1457 const char *errors)
1458{
1459 PyObject *v;
1460
1461 if (!PyUnicode_Check(unicode)) {
1462 PyErr_BadArgument();
1463 goto onError;
1464 }
1465
1466 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001467 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001468
1469 /* Encode via the codec registry */
1470 v = PyCodec_Encode(unicode, encoding, errors);
1471 if (v == NULL)
1472 goto onError;
1473 return v;
1474
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001476 return NULL;
1477}
1478
Victor Stinnerae6265f2010-05-15 16:27:27 +00001479PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1480{
Victor Stinner313a1202010-06-11 23:56:51 +00001481 if (Py_FileSystemDefaultEncoding) {
1482#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1483 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1484 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1485 PyUnicode_GET_SIZE(unicode),
1486 NULL);
1487#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001488 return PyUnicode_AsEncodedString(unicode,
1489 Py_FileSystemDefaultEncoding,
1490 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001491 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001492 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1493 PyUnicode_GET_SIZE(unicode),
1494 "surrogateescape");
1495}
1496
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1498 const char *encoding,
1499 const char *errors)
1500{
1501 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001502 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001503
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 if (!PyUnicode_Check(unicode)) {
1505 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 }
Fred Drakee4315f52000-05-09 19:53:39 +00001508
Tim Petersced69f82003-09-16 20:30:58 +00001509 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001511
1512 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001513 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1514 if (strcmp(lower, "utf-8") == 0)
1515 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 errors);
1518 else if ((strcmp(lower, "latin-1") == 0) ||
1519 (strcmp(lower, "iso-8859-1") == 0))
1520 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1521 PyUnicode_GET_SIZE(unicode),
1522 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001523#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001524 else if (strcmp(lower, "mbcs") == 0)
1525 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1526 PyUnicode_GET_SIZE(unicode),
1527 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001528#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001529 else if (strcmp(lower, "ascii") == 0)
1530 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1531 PyUnicode_GET_SIZE(unicode),
1532 errors);
1533 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001534 /* During bootstrap, we may need to find the encodings
1535 package, to load the file system encoding, and require the
1536 file system encoding in order to load the encodings
1537 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001538
Victor Stinner59e62db2010-05-15 13:14:32 +00001539 Break out of this dependency by assuming that the path to
1540 the encodings module is ASCII-only. XXX could try wcstombs
1541 instead, if the file system encoding is the locale's
1542 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001543 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001544 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1545 !PyThreadState_GET()->interp->codecs_initialized)
1546 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1547 PyUnicode_GET_SIZE(unicode),
1548 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549
1550 /* Encode via the codec registry */
1551 v = PyCodec_Encode(unicode, encoding, errors);
1552 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001553 return NULL;
1554
1555 /* The normal path */
1556 if (PyBytes_Check(v))
1557 return v;
1558
1559 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560 if (PyByteArray_Check(v)) {
1561 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001562 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001563 PyOS_snprintf(msg, sizeof(msg),
1564 "encoder %s returned buffer instead of bytes",
1565 encoding);
1566 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001567 Py_DECREF(v);
1568 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001569 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001570
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001571 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1572 Py_DECREF(v);
1573 return b;
1574 }
1575
1576 PyErr_Format(PyExc_TypeError,
1577 "encoder did not return a bytes object (type=%.400s)",
1578 Py_TYPE(v)->tp_name);
1579 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001580 return NULL;
1581}
1582
1583PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1584 const char *encoding,
1585 const char *errors)
1586{
1587 PyObject *v;
1588
1589 if (!PyUnicode_Check(unicode)) {
1590 PyErr_BadArgument();
1591 goto onError;
1592 }
1593
1594 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001595 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001596
1597 /* Encode via the codec registry */
1598 v = PyCodec_Encode(unicode, encoding, errors);
1599 if (v == NULL)
1600 goto onError;
1601 if (!PyUnicode_Check(v)) {
1602 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001603 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001604 Py_TYPE(v)->tp_name);
1605 Py_DECREF(v);
1606 goto onError;
1607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001609
Benjamin Peterson29060642009-01-31 22:14:21 +00001610 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 return NULL;
1612}
1613
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001614PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001615 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001616{
1617 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001618 if (v)
1619 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001620 if (errors != NULL)
1621 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001622 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001623 PyUnicode_GET_SIZE(unicode),
1624 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001625 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001626 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001627 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001628 return v;
1629}
1630
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001631PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001632PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001633 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001634 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1635}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001636
Christian Heimes5894ba72007-11-04 11:43:14 +00001637PyObject*
1638PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1639{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001640 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1641 can be undefined. If it is case, decode using UTF-8. The following assumes
1642 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1643 bootstrapping process where the codecs aren't ready yet.
1644 */
1645 if (Py_FileSystemDefaultEncoding) {
1646#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001647 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001648 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001649 }
1650#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001651 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001652 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001653 }
1654#endif
1655 return PyUnicode_Decode(s, size,
1656 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001657 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001658 }
1659 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001660 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001661 }
1662}
1663
Martin v. Löwis011e8422009-05-05 04:43:17 +00001664/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001665 system encoding. The addr param must be a PyObject**.
1666 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001667
1668int
1669PyUnicode_FSConverter(PyObject* arg, void* addr)
1670{
1671 PyObject *output = NULL;
1672 Py_ssize_t size;
1673 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001674 if (arg == NULL) {
1675 Py_DECREF(*(PyObject**)addr);
1676 return 1;
1677 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001678 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001679 output = arg;
1680 Py_INCREF(output);
1681 }
1682 else {
1683 arg = PyUnicode_FromObject(arg);
1684 if (!arg)
1685 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001686 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001687 Py_DECREF(arg);
1688 if (!output)
1689 return 0;
1690 if (!PyBytes_Check(output)) {
1691 Py_DECREF(output);
1692 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1693 return 0;
1694 }
1695 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001696 size = PyBytes_GET_SIZE(output);
1697 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001698 if (size != strlen(data)) {
1699 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1700 Py_DECREF(output);
1701 return 0;
1702 }
1703 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001704 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001705}
1706
1707
Martin v. Löwis5b222132007-06-10 09:51:05 +00001708char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001709_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001710{
Christian Heimesf3863112007-11-22 07:46:41 +00001711 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001712 if (!PyUnicode_Check(unicode)) {
1713 PyErr_BadArgument();
1714 return NULL;
1715 }
Christian Heimesf3863112007-11-22 07:46:41 +00001716 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1717 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001718 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001719 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001720 *psize = PyBytes_GET_SIZE(bytes);
1721 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001722}
1723
1724char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001725_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001726{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001727 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001728}
1729
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1731{
1732 if (!PyUnicode_Check(unicode)) {
1733 PyErr_BadArgument();
1734 goto onError;
1735 }
1736 return PyUnicode_AS_UNICODE(unicode);
1737
Benjamin Peterson29060642009-01-31 22:14:21 +00001738 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return NULL;
1740}
1741
Martin v. Löwis18e16552006-02-15 17:27:45 +00001742Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743{
1744 if (!PyUnicode_Check(unicode)) {
1745 PyErr_BadArgument();
1746 goto onError;
1747 }
1748 return PyUnicode_GET_SIZE(unicode);
1749
Benjamin Peterson29060642009-01-31 22:14:21 +00001750 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return -1;
1752}
1753
Thomas Wouters78890102000-07-22 19:25:51 +00001754const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001755{
1756 return unicode_default_encoding;
1757}
1758
1759int PyUnicode_SetDefaultEncoding(const char *encoding)
1760{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001761 if (strcmp(encoding, unicode_default_encoding) != 0) {
1762 PyErr_Format(PyExc_ValueError,
1763 "Can only set default encoding to %s",
1764 unicode_default_encoding);
1765 return -1;
1766 }
Fred Drakee4315f52000-05-09 19:53:39 +00001767 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001768}
1769
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001770/* error handling callback helper:
1771 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001772 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 and adjust various state variables.
1774 return 0 on success, -1 on error
1775*/
1776
1777static
1778int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 const char *encoding, const char *reason,
1780 const char **input, const char **inend, Py_ssize_t *startinpos,
1781 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1782 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001783{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001784 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785
1786 PyObject *restuple = NULL;
1787 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001789 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001790 Py_ssize_t requiredsize;
1791 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001793 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001794 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 int res = -1;
1796
1797 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 *errorHandler = PyCodec_LookupError(errors);
1799 if (*errorHandler == NULL)
1800 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 }
1802
1803 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001805 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1806 if (*exceptionObject == NULL)
1807 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 }
1809 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001810 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1811 goto onError;
1812 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1813 goto onError;
1814 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1815 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 }
1817
1818 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1819 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001820 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001821 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001822 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 }
1825 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001826 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001827
1828 /* Copy back the bytes variables, which might have been modified by the
1829 callback */
1830 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1831 if (!inputobj)
1832 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001833 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001834 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001835 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001836 *input = PyBytes_AS_STRING(inputobj);
1837 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001838 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001839 /* we can DECREF safely, as the exception has another reference,
1840 so the object won't go away. */
1841 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001844 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001845 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1847 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001848 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849
1850 /* need more space? (at least enough for what we
1851 have+the replacement+the rest of the string (starting
1852 at the new input position), so we won't have to check space
1853 when there are no errors in the rest of the string) */
1854 repptr = PyUnicode_AS_UNICODE(repunicode);
1855 repsize = PyUnicode_GET_SIZE(repunicode);
1856 requiredsize = *outpos + repsize + insize-newpos;
1857 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001858 if (requiredsize<2*outsize)
1859 requiredsize = 2*outsize;
1860 if (_PyUnicode_Resize(output, requiredsize) < 0)
1861 goto onError;
1862 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 }
1864 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001865 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001866 Py_UNICODE_COPY(*outptr, repptr, repsize);
1867 *outptr += repsize;
1868 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001870 /* we made it! */
1871 res = 0;
1872
Benjamin Peterson29060642009-01-31 22:14:21 +00001873 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001874 Py_XDECREF(restuple);
1875 return res;
1876}
1877
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  Encoding is conservative, decoding is liberal. */

/* Base-64 helper macros.  Note that each of them may evaluate its argument
   more than once. */

/* IS_BASE64: true when c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                            \
    (((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= '0' && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value of c.  Only meaningful when IS_BASE64(c)
   holds; any character outside the tested ranges maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte found in UTF-7 input decodes as itself.
 * Decoding is permissive: every ASCII byte stands for itself except '+',
 * which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1912
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  The value stored for each character is:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    /* 0x20: sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
    /* 0x30:  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
    /* 0x40:  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x50:  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
    /* 0x60:  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70:  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: true when character c is to be written as itself.
 * Set D characters always are; Set O characters only when directO is
 * true and whitespace only when directWS is true.  RFC2152 leaves these
 * two choices to the application, hence the flags.  NUL and anything
 * >= 128 is never direct.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001958
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001959PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001960 Py_ssize_t size,
1961 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001963 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1964}
1965
Antoine Pitrou244651a2009-05-04 18:56:13 +00001966/* The decoder. The only state we preserve is our read position,
1967 * i.e. how many characters we have consumed. So if we end in the
1968 * middle of a shift sequence we have to back off the read position
1969 * and the output to the beginning of the sequence, otherwise we lose
1970 * all the shift state (seen bits, number of bits seen, high
1971 * surrogate). */
1972
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001973PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001974 Py_ssize_t size,
1975 const char *errors,
1976 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001977{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001979 Py_ssize_t startinpos;
1980 Py_ssize_t endinpos;
1981 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001982 const char *e;
1983 PyUnicodeObject *unicode;
1984 Py_UNICODE *p;
1985 const char *errmsg = "";
1986 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001987 Py_UNICODE *shiftOutStart;
1988 unsigned int base64bits = 0;
1989 unsigned long base64buffer = 0;
1990 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001991 PyObject *errorHandler = NULL;
1992 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001993
1994 unicode = _PyUnicode_New(size);
1995 if (!unicode)
1996 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001997 if (size == 0) {
1998 if (consumed)
1999 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002000 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002001 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002002
2003 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002004 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002005 e = s + size;
2006
2007 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002008 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002010 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002011
Antoine Pitrou244651a2009-05-04 18:56:13 +00002012 if (inShift) { /* in a base-64 section */
2013 if (IS_BASE64(ch)) { /* consume a base-64 character */
2014 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2015 base64bits += 6;
2016 s++;
2017 if (base64bits >= 16) {
2018 /* we have enough bits for a UTF-16 value */
2019 Py_UNICODE outCh = (Py_UNICODE)
2020 (base64buffer >> (base64bits-16));
2021 base64bits -= 16;
2022 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2023 if (surrogate) {
2024 /* expecting a second surrogate */
2025 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2026#ifdef Py_UNICODE_WIDE
2027 *p++ = (((surrogate & 0x3FF)<<10)
2028 | (outCh & 0x3FF)) + 0x10000;
2029#else
2030 *p++ = surrogate;
2031 *p++ = outCh;
2032#endif
2033 surrogate = 0;
2034 }
2035 else {
2036 surrogate = 0;
2037 errmsg = "second surrogate missing";
2038 goto utf7Error;
2039 }
2040 }
2041 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2042 /* first surrogate */
2043 surrogate = outCh;
2044 }
2045 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2046 errmsg = "unexpected second surrogate";
2047 goto utf7Error;
2048 }
2049 else {
2050 *p++ = outCh;
2051 }
2052 }
2053 }
2054 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002055 inShift = 0;
2056 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002057 if (surrogate) {
2058 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002059 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002060 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002061 if (base64bits > 0) { /* left-over bits */
2062 if (base64bits >= 6) {
2063 /* We've seen at least one base-64 character */
2064 errmsg = "partial character in shift sequence";
2065 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002066 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002067 else {
2068 /* Some bits remain; they should be zero */
2069 if (base64buffer != 0) {
2070 errmsg = "non-zero padding bits in shift sequence";
2071 goto utf7Error;
2072 }
2073 }
2074 }
2075 if (ch != '-') {
2076 /* '-' is absorbed; other terminating
2077 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002078 *p++ = ch;
2079 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002080 }
2081 }
2082 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002083 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002084 s++; /* consume '+' */
2085 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002086 s++;
2087 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002088 }
2089 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002090 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002091 shiftOutStart = p;
2092 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002093 }
2094 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002095 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002096 *p++ = ch;
2097 s++;
2098 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002099 else {
2100 startinpos = s-starts;
2101 s++;
2102 errmsg = "unexpected special character";
2103 goto utf7Error;
2104 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002105 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002106utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 outpos = p-PyUnicode_AS_UNICODE(unicode);
2108 endinpos = s-starts;
2109 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002110 errors, &errorHandler,
2111 "utf7", errmsg,
2112 &starts, &e, &startinpos, &endinpos, &exc, &s,
2113 &unicode, &outpos, &p))
2114 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002115 }
2116
Antoine Pitrou244651a2009-05-04 18:56:13 +00002117 /* end of string */
2118
2119 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2120 /* if we're in an inconsistent state, that's an error */
2121 if (surrogate ||
2122 (base64bits >= 6) ||
2123 (base64bits > 0 && base64buffer != 0)) {
2124 outpos = p-PyUnicode_AS_UNICODE(unicode);
2125 endinpos = size;
2126 if (unicode_decode_call_errorhandler(
2127 errors, &errorHandler,
2128 "utf7", "unterminated shift sequence",
2129 &starts, &e, &startinpos, &endinpos, &exc, &s,
2130 &unicode, &outpos, &p))
2131 goto onError;
2132 if (s < e)
2133 goto restart;
2134 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002136
2137 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002138 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002139 if (inShift) {
2140 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002141 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002142 }
2143 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002144 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002145 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002146 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002147
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002148 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002149 goto onError;
2150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002151 Py_XDECREF(errorHandler);
2152 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002153 return (PyObject *)unicode;
2154
Benjamin Peterson29060642009-01-31 22:14:21 +00002155 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002156 Py_XDECREF(errorHandler);
2157 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002158 Py_DECREF(unicode);
2159 return NULL;
2160}
2161
2162
2163PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002164 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002165 int base64SetO,
2166 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002167 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002168{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002169 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002170 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002171 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002172 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002173 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002174 unsigned int base64bits = 0;
2175 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002176 char * out;
2177 char * start;
2178
2179 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002180 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002181
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002182 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002183 return PyErr_NoMemory();
2184
Antoine Pitrou244651a2009-05-04 18:56:13 +00002185 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002186 if (v == NULL)
2187 return NULL;
2188
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002189 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002190 for (;i < size; ++i) {
2191 Py_UNICODE ch = s[i];
2192
Antoine Pitrou244651a2009-05-04 18:56:13 +00002193 if (inShift) {
2194 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2195 /* shifting out */
2196 if (base64bits) { /* output remaining bits */
2197 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2198 base64buffer = 0;
2199 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002200 }
2201 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002202 /* Characters not in the BASE64 set implicitly unshift the sequence
2203 so no '-' is required, except if the character is itself a '-' */
2204 if (IS_BASE64(ch) || ch == '-') {
2205 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002207 *out++ = (char) ch;
2208 }
2209 else {
2210 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002211 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002212 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002213 else { /* not in a shift sequence */
2214 if (ch == '+') {
2215 *out++ = '+';
2216 *out++ = '-';
2217 }
2218 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2219 *out++ = (char) ch;
2220 }
2221 else {
2222 *out++ = '+';
2223 inShift = 1;
2224 goto encode_char;
2225 }
2226 }
2227 continue;
2228encode_char:
2229#ifdef Py_UNICODE_WIDE
2230 if (ch >= 0x10000) {
2231 /* code first surrogate */
2232 base64bits += 16;
2233 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2234 while (base64bits >= 6) {
2235 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2236 base64bits -= 6;
2237 }
2238 /* prepare second surrogate */
2239 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2240 }
2241#endif
2242 base64bits += 16;
2243 base64buffer = (base64buffer << 16) | ch;
2244 while (base64bits >= 6) {
2245 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2246 base64bits -= 6;
2247 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002248 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002249 if (base64bits)
2250 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2251 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002252 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002253 if (_PyBytes_Resize(&v, out - start) < 0)
2254 return NULL;
2255 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002256}
2257
Antoine Pitrou244651a2009-05-04 18:56:13 +00002258#undef IS_BASE64
2259#undef FROM_BASE64
2260#undef TO_BASE64
2261#undef DECODE_DIRECT
2262#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002263
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264/* --- UTF-8 Codec -------------------------------------------------------- */
2265
/* Sequence length implied by each possible UTF-8 prefix (lead) byte,
   indexed by the byte value.  A zero entry means the byte is illegal
   as a prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: never valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE and 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
2287
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 Py_ssize_t size,
2290 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291{
Walter Dörwald69652032004-09-07 20:24:22 +00002292 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2293}
2294
Antoine Pitrouab868312009-01-10 15:40:25 +00002295/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2296#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2297
2298/* Mask to quickly check whether a C 'long' contains a
2299 non-ASCII, UTF8-encoded char. */
2300#if (SIZEOF_LONG == 8)
2301# define ASCII_CHAR_MASK 0x8080808080808080L
2302#elif (SIZEOF_LONG == 4)
2303# define ASCII_CHAR_MASK 0x80808080L
2304#else
2305# error C 'long' size should be either 4 or 8!
2306#endif
2307
Walter Dörwald69652032004-09-07 20:24:22 +00002308PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002309 Py_ssize_t size,
2310 const char *errors,
2311 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002312{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002313 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002315 Py_ssize_t startinpos;
2316 Py_ssize_t endinpos;
2317 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002318 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319 PyUnicodeObject *unicode;
2320 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002321 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002322 PyObject *errorHandler = NULL;
2323 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324
2325 /* Note: size will always be longer than the resulting Unicode
2326 character count */
2327 unicode = _PyUnicode_New(size);
2328 if (!unicode)
2329 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002330 if (size == 0) {
2331 if (consumed)
2332 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335
2336 /* Unpack UTF-8 encoded data */
2337 p = unicode->str;
2338 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002339 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340
2341 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002342 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343
2344 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002345 /* Fast path for runs of ASCII characters. Given that common UTF-8
2346 input will consist of an overwhelming majority of ASCII
2347 characters, we try to optimize for this case by checking
2348 as many characters as a C 'long' can contain.
2349 First, check if we can do an aligned read, as most CPUs have
2350 a penalty for unaligned reads.
2351 */
2352 if (!((size_t) s & LONG_PTR_MASK)) {
2353 /* Help register allocation */
2354 register const char *_s = s;
2355 register Py_UNICODE *_p = p;
2356 while (_s < aligned_end) {
2357 /* Read a whole long at a time (either 4 or 8 bytes),
2358 and do a fast unrolled copy if it only contains ASCII
2359 characters. */
2360 unsigned long data = *(unsigned long *) _s;
2361 if (data & ASCII_CHAR_MASK)
2362 break;
2363 _p[0] = (unsigned char) _s[0];
2364 _p[1] = (unsigned char) _s[1];
2365 _p[2] = (unsigned char) _s[2];
2366 _p[3] = (unsigned char) _s[3];
2367#if (SIZEOF_LONG == 8)
2368 _p[4] = (unsigned char) _s[4];
2369 _p[5] = (unsigned char) _s[5];
2370 _p[6] = (unsigned char) _s[6];
2371 _p[7] = (unsigned char) _s[7];
2372#endif
2373 _s += SIZEOF_LONG;
2374 _p += SIZEOF_LONG;
2375 }
2376 s = _s;
2377 p = _p;
2378 if (s == e)
2379 break;
2380 ch = (unsigned char)*s;
2381 }
2382 }
2383
2384 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002385 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386 s++;
2387 continue;
2388 }
2389
2390 n = utf8_code_length[ch];
2391
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002392 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002393 if (consumed)
2394 break;
2395 else {
2396 errmsg = "unexpected end of data";
2397 startinpos = s-starts;
2398 endinpos = size;
2399 goto utf8Error;
2400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402
2403 switch (n) {
2404
2405 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002406 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002407 startinpos = s-starts;
2408 endinpos = startinpos+1;
2409 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410
2411 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002412 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002413 startinpos = s-starts;
2414 endinpos = startinpos+1;
2415 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416
2417 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002418 if ((s[1] & 0xc0) != 0x80) {
2419 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002420 startinpos = s-starts;
2421 endinpos = startinpos+2;
2422 goto utf8Error;
2423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002425 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002426 startinpos = s-starts;
2427 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002428 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002429 goto utf8Error;
2430 }
2431 else
2432 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002433 break;
2434
2435 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002436 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002437 (s[2] & 0xc0) != 0x80) {
2438 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002439 startinpos = s-starts;
2440 endinpos = startinpos+3;
2441 goto utf8Error;
2442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002443 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002444 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002445 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002446 startinpos = s-starts;
2447 endinpos = startinpos+3;
2448 goto utf8Error;
2449 }
2450 else
2451 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002452 break;
2453
2454 case 4:
2455 if ((s[1] & 0xc0) != 0x80 ||
2456 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002457 (s[3] & 0xc0) != 0x80) {
2458 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002459 startinpos = s-starts;
2460 endinpos = startinpos+4;
2461 goto utf8Error;
2462 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002463 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002464 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002465 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002466 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002467 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002468 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002469 UTF-16 */
2470 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002471 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002472 startinpos = s-starts;
2473 endinpos = startinpos+4;
2474 goto utf8Error;
2475 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002476#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002477 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002478#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002479 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002480
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002481 /* translate from 10000..10FFFF to 0..FFFF */
2482 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002483
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002484 /* high surrogate = top 10 bits added to D800 */
2485 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002486
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002487 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002488 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002489#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 break;
2491
2492 default:
2493 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002494 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002495 startinpos = s-starts;
2496 endinpos = startinpos+n;
2497 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 }
2499 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002500 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002501
Benjamin Peterson29060642009-01-31 22:14:21 +00002502 utf8Error:
2503 outpos = p-PyUnicode_AS_UNICODE(unicode);
2504 if (unicode_decode_call_errorhandler(
2505 errors, &errorHandler,
2506 "utf8", errmsg,
2507 &starts, &e, &startinpos, &endinpos, &exc, &s,
2508 &unicode, &outpos, &p))
2509 goto onError;
2510 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 }
Walter Dörwald69652032004-09-07 20:24:22 +00002512 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002513 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514
2515 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002516 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 goto onError;
2518
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002519 Py_XDECREF(errorHandler);
2520 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 return (PyObject *)unicode;
2522
Benjamin Peterson29060642009-01-31 22:14:21 +00002523 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 Py_XDECREF(errorHandler);
2525 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 Py_DECREF(unicode);
2527 return NULL;
2528}
2529
Antoine Pitrouab868312009-01-10 15:40:25 +00002530#undef ASCII_CHAR_MASK
2531
2532
Tim Peters602f7402002-04-27 18:03:26 +00002533/* Allocation strategy: if the string is short, convert into a stack buffer
2534 and allocate exactly as much space needed at the end. Else allocate the
2535 maximum possible needed (4 result bytes per Unicode character), and return
2536 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002537*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002538PyObject *
2539PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002540 Py_ssize_t size,
2541 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542{
Tim Peters602f7402002-04-27 18:03:26 +00002543#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002544
Guido van Rossum98297ee2007-11-06 21:34:58 +00002545 Py_ssize_t i; /* index into s of next input byte */
2546 PyObject *result; /* result string object */
2547 char *p; /* next free byte in output buffer */
2548 Py_ssize_t nallocated; /* number of result bytes allocated */
2549 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002550 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002551 PyObject *errorHandler = NULL;
2552 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002553
Tim Peters602f7402002-04-27 18:03:26 +00002554 assert(s != NULL);
2555 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556
Tim Peters602f7402002-04-27 18:03:26 +00002557 if (size <= MAX_SHORT_UNICHARS) {
2558 /* Write into the stack buffer; nallocated can't overflow.
2559 * At the end, we'll allocate exactly as much heap space as it
2560 * turns out we need.
2561 */
2562 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002563 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002564 p = stackbuf;
2565 }
2566 else {
2567 /* Overallocate on the heap, and give the excess back at the end. */
2568 nallocated = size * 4;
2569 if (nallocated / 4 != size) /* overflow! */
2570 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002571 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002572 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002573 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002574 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002575 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002576
Tim Peters602f7402002-04-27 18:03:26 +00002577 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002578 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002579
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002580 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002581 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002583
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002585 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002586 *p++ = (char)(0xc0 | (ch >> 6));
2587 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002588 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002589#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002590 /* Special case: check for high and low surrogate */
2591 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2592 Py_UCS4 ch2 = s[i];
2593 /* Combine the two surrogates to form a UCS4 value */
2594 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2595 i++;
2596
2597 /* Encode UCS4 Unicode ordinals */
2598 *p++ = (char)(0xf0 | (ch >> 18));
2599 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002600 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2601 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002602 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002603#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002604 Py_ssize_t newpos;
2605 PyObject *rep;
2606 Py_ssize_t repsize, k;
2607 rep = unicode_encode_call_errorhandler
2608 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2609 s, size, &exc, i-1, i, &newpos);
2610 if (!rep)
2611 goto error;
2612
2613 if (PyBytes_Check(rep))
2614 repsize = PyBytes_GET_SIZE(rep);
2615 else
2616 repsize = PyUnicode_GET_SIZE(rep);
2617
2618 if (repsize > 4) {
2619 Py_ssize_t offset;
2620
2621 if (result == NULL)
2622 offset = p - stackbuf;
2623 else
2624 offset = p - PyBytes_AS_STRING(result);
2625
2626 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2627 /* integer overflow */
2628 PyErr_NoMemory();
2629 goto error;
2630 }
2631 nallocated += repsize - 4;
2632 if (result != NULL) {
2633 if (_PyBytes_Resize(&result, nallocated) < 0)
2634 goto error;
2635 } else {
2636 result = PyBytes_FromStringAndSize(NULL, nallocated);
2637 if (result == NULL)
2638 goto error;
2639 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2640 }
2641 p = PyBytes_AS_STRING(result) + offset;
2642 }
2643
2644 if (PyBytes_Check(rep)) {
2645 char *prep = PyBytes_AS_STRING(rep);
2646 for(k = repsize; k > 0; k--)
2647 *p++ = *prep++;
2648 } else /* rep is unicode */ {
2649 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2650 Py_UNICODE c;
2651
2652 for(k=0; k<repsize; k++) {
2653 c = prep[k];
2654 if (0x80 <= c) {
2655 raise_encode_exception(&exc, "utf-8", s, size,
2656 i-1, i, "surrogates not allowed");
2657 goto error;
2658 }
2659 *p++ = (char)prep[k];
2660 }
2661 }
2662 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002663#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002664 }
Victor Stinner445a6232010-04-22 20:01:57 +00002665#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002666 } else if (ch < 0x10000) {
2667 *p++ = (char)(0xe0 | (ch >> 12));
2668 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2669 *p++ = (char)(0x80 | (ch & 0x3f));
2670 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002671 /* Encode UCS4 Unicode ordinals */
2672 *p++ = (char)(0xf0 | (ch >> 18));
2673 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2674 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2675 *p++ = (char)(0x80 | (ch & 0x3f));
2676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002678
Guido van Rossum98297ee2007-11-06 21:34:58 +00002679 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002680 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002681 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002682 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002683 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002684 }
2685 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002686 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002687 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002688 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002689 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002690 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002691 Py_XDECREF(errorHandler);
2692 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002693 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002694 error:
2695 Py_XDECREF(errorHandler);
2696 Py_XDECREF(exc);
2697 Py_XDECREF(result);
2698 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002699
Tim Peters602f7402002-04-27 18:03:26 +00002700#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701}
2702
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2704{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 if (!PyUnicode_Check(unicode)) {
2706 PyErr_BadArgument();
2707 return NULL;
2708 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002709 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002710 PyUnicode_GET_SIZE(unicode),
2711 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712}
2713
Walter Dörwald41980ca2007-08-16 21:55:45 +00002714/* --- UTF-32 Codec ------------------------------------------------------- */
2715
2716PyObject *
2717PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 Py_ssize_t size,
2719 const char *errors,
2720 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002721{
2722 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2723}
2724
2725PyObject *
2726PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002727 Py_ssize_t size,
2728 const char *errors,
2729 int *byteorder,
2730 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002731{
2732 const char *starts = s;
2733 Py_ssize_t startinpos;
2734 Py_ssize_t endinpos;
2735 Py_ssize_t outpos;
2736 PyUnicodeObject *unicode;
2737 Py_UNICODE *p;
2738#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002739 int pairs = 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002740#else
2741 const int pairs = 0;
2742#endif
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002743 const unsigned char *q, *e, *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002744 int bo = 0; /* assume native ordering by default */
2745 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002746 /* Offsets from q for retrieving bytes in the right order. */
2747#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2748 int iorder[] = {0, 1, 2, 3};
2749#else
2750 int iorder[] = {3, 2, 1, 0};
2751#endif
2752 PyObject *errorHandler = NULL;
2753 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002754
Walter Dörwald41980ca2007-08-16 21:55:45 +00002755 q = (unsigned char *)s;
2756 e = q + size;
2757
2758 if (byteorder)
2759 bo = *byteorder;
2760
2761 /* Check for BOM marks (U+FEFF) in the input and adjust current
2762 byte order setting accordingly. In native mode, the leading BOM
2763 mark is skipped, in all other modes, it is copied to the output
2764 stream as-is (giving a ZWNBSP character). */
2765 if (bo == 0) {
2766 if (size >= 4) {
2767 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002768 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002769#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002770 if (bom == 0x0000FEFF) {
2771 q += 4;
2772 bo = -1;
2773 }
2774 else if (bom == 0xFFFE0000) {
2775 q += 4;
2776 bo = 1;
2777 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002778#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002779 if (bom == 0x0000FEFF) {
2780 q += 4;
2781 bo = 1;
2782 }
2783 else if (bom == 0xFFFE0000) {
2784 q += 4;
2785 bo = -1;
2786 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002787#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002788 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002789 }
2790
2791 if (bo == -1) {
2792 /* force LE */
2793 iorder[0] = 0;
2794 iorder[1] = 1;
2795 iorder[2] = 2;
2796 iorder[3] = 3;
2797 }
2798 else if (bo == 1) {
2799 /* force BE */
2800 iorder[0] = 3;
2801 iorder[1] = 2;
2802 iorder[2] = 1;
2803 iorder[3] = 0;
2804 }
2805
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002806 /* On narrow builds we split characters outside the BMP into two
2807 codepoints => count how much extra space we need. */
2808#ifndef Py_UNICODE_WIDE
2809 for (qq = q; qq < e; qq += 4)
2810 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2811 pairs++;
2812#endif
2813
2814 /* This might be one to much, because of a BOM */
2815 unicode = _PyUnicode_New((size+3)/4+pairs);
2816 if (!unicode)
2817 return NULL;
2818 if (size == 0)
2819 return (PyObject *)unicode;
2820
2821 /* Unpack UTF-32 encoded data */
2822 p = unicode->str;
2823
Walter Dörwald41980ca2007-08-16 21:55:45 +00002824 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002825 Py_UCS4 ch;
2826 /* remaining bytes at the end? (size should be divisible by 4) */
2827 if (e-q<4) {
2828 if (consumed)
2829 break;
2830 errmsg = "truncated data";
2831 startinpos = ((const char *)q)-starts;
2832 endinpos = ((const char *)e)-starts;
2833 goto utf32Error;
2834 /* The remaining input chars are ignored if the callback
2835 chooses to skip the input */
2836 }
2837 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2838 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002839
Benjamin Peterson29060642009-01-31 22:14:21 +00002840 if (ch >= 0x110000)
2841 {
2842 errmsg = "codepoint not in range(0x110000)";
2843 startinpos = ((const char *)q)-starts;
2844 endinpos = startinpos+4;
2845 goto utf32Error;
2846 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002847#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002848 if (ch >= 0x10000)
2849 {
2850 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2851 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2852 }
2853 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002854#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 *p++ = ch;
2856 q += 4;
2857 continue;
2858 utf32Error:
2859 outpos = p-PyUnicode_AS_UNICODE(unicode);
2860 if (unicode_decode_call_errorhandler(
2861 errors, &errorHandler,
2862 "utf32", errmsg,
2863 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2864 &unicode, &outpos, &p))
2865 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002866 }
2867
2868 if (byteorder)
2869 *byteorder = bo;
2870
2871 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002872 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002873
2874 /* Adjust length */
2875 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2876 goto onError;
2877
2878 Py_XDECREF(errorHandler);
2879 Py_XDECREF(exc);
2880 return (PyObject *)unicode;
2881
Benjamin Peterson29060642009-01-31 22:14:21 +00002882 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002883 Py_DECREF(unicode);
2884 Py_XDECREF(errorHandler);
2885 Py_XDECREF(exc);
2886 return NULL;
2887}
2888
2889PyObject *
2890PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 Py_ssize_t size,
2892 const char *errors,
2893 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002894{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002895 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002896 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002897 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002898#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002899 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002900#else
2901 const int pairs = 0;
2902#endif
2903 /* Offsets from p for storing byte pairs in the right order. */
2904#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2905 int iorder[] = {0, 1, 2, 3};
2906#else
2907 int iorder[] = {3, 2, 1, 0};
2908#endif
2909
Benjamin Peterson29060642009-01-31 22:14:21 +00002910#define STORECHAR(CH) \
2911 do { \
2912 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2913 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2914 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2915 p[iorder[0]] = (CH) & 0xff; \
2916 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002917 } while(0)
2918
2919 /* In narrow builds we can output surrogate pairs as one codepoint,
2920 so we need less space. */
2921#ifndef Py_UNICODE_WIDE
2922 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2924 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2925 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002926#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002927 nsize = (size - pairs + (byteorder == 0));
2928 bytesize = nsize * 4;
2929 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002931 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002932 if (v == NULL)
2933 return NULL;
2934
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002935 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002936 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002937 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002938 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002939 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002940
2941 if (byteorder == -1) {
2942 /* force LE */
2943 iorder[0] = 0;
2944 iorder[1] = 1;
2945 iorder[2] = 2;
2946 iorder[3] = 3;
2947 }
2948 else if (byteorder == 1) {
2949 /* force BE */
2950 iorder[0] = 3;
2951 iorder[1] = 2;
2952 iorder[2] = 1;
2953 iorder[3] = 0;
2954 }
2955
2956 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002957 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002958#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002959 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2960 Py_UCS4 ch2 = *s;
2961 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2962 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2963 s++;
2964 size--;
2965 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002966 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002967#endif
2968 STORECHAR(ch);
2969 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002970
2971 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002972 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002973#undef STORECHAR
2974}
2975
2976PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2977{
2978 if (!PyUnicode_Check(unicode)) {
2979 PyErr_BadArgument();
2980 return NULL;
2981 }
2982 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002983 PyUnicode_GET_SIZE(unicode),
2984 NULL,
2985 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002986}
2987
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988/* --- UTF-16 Codec ------------------------------------------------------- */
2989
Tim Peters772747b2001-08-09 22:21:55 +00002990PyObject *
2991PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002992 Py_ssize_t size,
2993 const char *errors,
2994 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995{
Walter Dörwald69652032004-09-07 20:24:22 +00002996 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2997}
2998
Antoine Pitrouab868312009-01-10 15:40:25 +00002999/* Two masks for fast checking of whether a C 'long' may contain
3000 UTF16-encoded surrogate characters. This is an efficient heuristic,
3001 assuming that non-surrogate characters with a code point >= 0x8000 are
3002 rare in most input.
3003 FAST_CHAR_MASK is used when the input is in native byte ordering,
3004 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003005*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003006#if (SIZEOF_LONG == 8)
3007# define FAST_CHAR_MASK 0x8000800080008000L
3008# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3009#elif (SIZEOF_LONG == 4)
3010# define FAST_CHAR_MASK 0x80008000L
3011# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3012#else
3013# error C 'long' size should be either 4 or 8!
3014#endif
3015
Walter Dörwald69652032004-09-07 20:24:22 +00003016PyObject *
3017PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 Py_ssize_t size,
3019 const char *errors,
3020 int *byteorder,
3021 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003022{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003023 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003024 Py_ssize_t startinpos;
3025 Py_ssize_t endinpos;
3026 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 PyUnicodeObject *unicode;
3028 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003029 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003030 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003031 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003032 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003033 /* Offsets from q for retrieving byte pairs in the right order. */
3034#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3035 int ihi = 1, ilo = 0;
3036#else
3037 int ihi = 0, ilo = 1;
3038#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003039 PyObject *errorHandler = NULL;
3040 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041
3042 /* Note: size will always be longer than the resulting Unicode
3043 character count */
3044 unicode = _PyUnicode_New(size);
3045 if (!unicode)
3046 return NULL;
3047 if (size == 0)
3048 return (PyObject *)unicode;
3049
3050 /* Unpack UTF-16 encoded data */
3051 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003052 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003053 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054
3055 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003056 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003058 /* Check for BOM marks (U+FEFF) in the input and adjust current
3059 byte order setting accordingly. In native mode, the leading BOM
3060 mark is skipped, in all other modes, it is copied to the output
3061 stream as-is (giving a ZWNBSP character). */
3062 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003063 if (size >= 2) {
3064 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003065#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003066 if (bom == 0xFEFF) {
3067 q += 2;
3068 bo = -1;
3069 }
3070 else if (bom == 0xFFFE) {
3071 q += 2;
3072 bo = 1;
3073 }
Tim Petersced69f82003-09-16 20:30:58 +00003074#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003075 if (bom == 0xFEFF) {
3076 q += 2;
3077 bo = 1;
3078 }
3079 else if (bom == 0xFFFE) {
3080 q += 2;
3081 bo = -1;
3082 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003083#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003085 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086
Tim Peters772747b2001-08-09 22:21:55 +00003087 if (bo == -1) {
3088 /* force LE */
3089 ihi = 1;
3090 ilo = 0;
3091 }
3092 else if (bo == 1) {
3093 /* force BE */
3094 ihi = 0;
3095 ilo = 1;
3096 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003097#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3098 native_ordering = ilo < ihi;
3099#else
3100 native_ordering = ilo > ihi;
3101#endif
Tim Peters772747b2001-08-09 22:21:55 +00003102
Antoine Pitrouab868312009-01-10 15:40:25 +00003103 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003104 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003106 /* First check for possible aligned read of a C 'long'. Unaligned
3107 reads are more expensive, better to defer to another iteration. */
3108 if (!((size_t) q & LONG_PTR_MASK)) {
3109 /* Fast path for runs of non-surrogate chars. */
3110 register const unsigned char *_q = q;
3111 Py_UNICODE *_p = p;
3112 if (native_ordering) {
3113 /* Native ordering is simple: as long as the input cannot
3114 possibly contain a surrogate char, do an unrolled copy
3115 of several 16-bit code points to the target object.
3116 The non-surrogate check is done on several input bytes
3117 at a time (as many as a C 'long' can contain). */
3118 while (_q < aligned_end) {
3119 unsigned long data = * (unsigned long *) _q;
3120 if (data & FAST_CHAR_MASK)
3121 break;
3122 _p[0] = ((unsigned short *) _q)[0];
3123 _p[1] = ((unsigned short *) _q)[1];
3124#if (SIZEOF_LONG == 8)
3125 _p[2] = ((unsigned short *) _q)[2];
3126 _p[3] = ((unsigned short *) _q)[3];
3127#endif
3128 _q += SIZEOF_LONG;
3129 _p += SIZEOF_LONG / 2;
3130 }
3131 }
3132 else {
3133 /* Byteswapped ordering is similar, but we must decompose
3134 the copy bytewise, and take care of zero'ing out the
3135 upper bytes if the target object is in 32-bit units
3136 (that is, in UCS-4 builds). */
3137 while (_q < aligned_end) {
3138 unsigned long data = * (unsigned long *) _q;
3139 if (data & SWAPPED_FAST_CHAR_MASK)
3140 break;
3141 /* Zero upper bytes in UCS-4 builds */
3142#if (Py_UNICODE_SIZE > 2)
3143 _p[0] = 0;
3144 _p[1] = 0;
3145#if (SIZEOF_LONG == 8)
3146 _p[2] = 0;
3147 _p[3] = 0;
3148#endif
3149#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003150 /* Issue #4916; UCS-4 builds on big endian machines must
3151 fill the two last bytes of each 4-byte unit. */
3152#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3153# define OFF 2
3154#else
3155# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003156#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003157 ((unsigned char *) _p)[OFF + 1] = _q[0];
3158 ((unsigned char *) _p)[OFF + 0] = _q[1];
3159 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3160 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3161#if (SIZEOF_LONG == 8)
3162 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3163 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3164 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3165 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3166#endif
3167#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003168 _q += SIZEOF_LONG;
3169 _p += SIZEOF_LONG / 2;
3170 }
3171 }
3172 p = _p;
3173 q = _q;
3174 if (q >= e)
3175 break;
3176 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003177 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003178
Benjamin Peterson14339b62009-01-31 16:36:08 +00003179 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003180
3181 if (ch < 0xD800 || ch > 0xDFFF) {
3182 *p++ = ch;
3183 continue;
3184 }
3185
3186 /* UTF-16 code pair: */
3187 if (q > e) {
3188 errmsg = "unexpected end of data";
3189 startinpos = (((const char *)q) - 2) - starts;
3190 endinpos = ((const char *)e) + 1 - starts;
3191 goto utf16Error;
3192 }
3193 if (0xD800 <= ch && ch <= 0xDBFF) {
3194 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3195 q += 2;
3196 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003197#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003198 *p++ = ch;
3199 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003200#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003201 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003202#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003203 continue;
3204 }
3205 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003206 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003207 startinpos = (((const char *)q)-4)-starts;
3208 endinpos = startinpos+2;
3209 goto utf16Error;
3210 }
3211
Benjamin Peterson14339b62009-01-31 16:36:08 +00003212 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 errmsg = "illegal encoding";
3214 startinpos = (((const char *)q)-2)-starts;
3215 endinpos = startinpos+2;
3216 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003217
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 utf16Error:
3219 outpos = p - PyUnicode_AS_UNICODE(unicode);
3220 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003221 errors,
3222 &errorHandler,
3223 "utf16", errmsg,
3224 &starts,
3225 (const char **)&e,
3226 &startinpos,
3227 &endinpos,
3228 &exc,
3229 (const char **)&q,
3230 &unicode,
3231 &outpos,
3232 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003235 /* remaining byte at the end? (size should be even) */
3236 if (e == q) {
3237 if (!consumed) {
3238 errmsg = "truncated data";
3239 startinpos = ((const char *)q) - starts;
3240 endinpos = ((const char *)e) + 1 - starts;
3241 outpos = p - PyUnicode_AS_UNICODE(unicode);
3242 if (unicode_decode_call_errorhandler(
3243 errors,
3244 &errorHandler,
3245 "utf16", errmsg,
3246 &starts,
3247 (const char **)&e,
3248 &startinpos,
3249 &endinpos,
3250 &exc,
3251 (const char **)&q,
3252 &unicode,
3253 &outpos,
3254 &p))
3255 goto onError;
3256 /* The remaining input chars are ignored if the callback
3257 chooses to skip the input */
3258 }
3259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260
3261 if (byteorder)
3262 *byteorder = bo;
3263
Walter Dörwald69652032004-09-07 20:24:22 +00003264 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003266
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003268 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 goto onError;
3270
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 Py_XDECREF(errorHandler);
3272 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 return (PyObject *)unicode;
3274
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277 Py_XDECREF(errorHandler);
3278 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003279 return NULL;
3280}
3281
Antoine Pitrouab868312009-01-10 15:40:25 +00003282#undef FAST_CHAR_MASK
3283#undef SWAPPED_FAST_CHAR_MASK
3284
Tim Peters772747b2001-08-09 22:21:55 +00003285PyObject *
3286PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003287 Py_ssize_t size,
3288 const char *errors,
3289 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003291 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003292 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003293 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003294#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003295 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003296#else
3297 const int pairs = 0;
3298#endif
Tim Peters772747b2001-08-09 22:21:55 +00003299 /* Offsets from p for storing byte pairs in the right order. */
3300#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3301 int ihi = 1, ilo = 0;
3302#else
3303 int ihi = 0, ilo = 1;
3304#endif
3305
Benjamin Peterson29060642009-01-31 22:14:21 +00003306#define STORECHAR(CH) \
3307 do { \
3308 p[ihi] = ((CH) >> 8) & 0xff; \
3309 p[ilo] = (CH) & 0xff; \
3310 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003311 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003313#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003314 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003315 if (s[i] >= 0x10000)
3316 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003317#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003318 /* 2 * (size + pairs + (byteorder == 0)) */
3319 if (size > PY_SSIZE_T_MAX ||
3320 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003321 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003322 nsize = size + pairs + (byteorder == 0);
3323 bytesize = nsize * 2;
3324 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003325 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003326 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 if (v == NULL)
3328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003330 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003333 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003334 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003335
3336 if (byteorder == -1) {
3337 /* force LE */
3338 ihi = 1;
3339 ilo = 0;
3340 }
3341 else if (byteorder == 1) {
3342 /* force BE */
3343 ihi = 0;
3344 ilo = 1;
3345 }
3346
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003347 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003348 Py_UNICODE ch = *s++;
3349 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003350#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003351 if (ch >= 0x10000) {
3352 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3353 ch = 0xD800 | ((ch-0x10000) >> 10);
3354 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003355#endif
Tim Peters772747b2001-08-09 22:21:55 +00003356 STORECHAR(ch);
3357 if (ch2)
3358 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003359 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003360
3361 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003362 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003363#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364}
3365
3366PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3367{
3368 if (!PyUnicode_Check(unicode)) {
3369 PyErr_BadArgument();
3370 return NULL;
3371 }
3372 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003373 PyUnicode_GET_SIZE(unicode),
3374 NULL,
3375 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376}
3377
3378/* --- Unicode Escape Codec ----------------------------------------------- */
3379
Fredrik Lundh06d12682001-01-24 07:59:11 +00003380static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003381
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003383 Py_ssize_t size,
3384 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003387 Py_ssize_t startinpos;
3388 Py_ssize_t endinpos;
3389 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003392 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003394 char* message;
3395 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396 PyObject *errorHandler = NULL;
3397 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003398
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399 /* Escaped strings will always be longer than the resulting
3400 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 length after conversion to the true value.
3402 (but if the error callback returns a long replacement string
3403 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404 v = _PyUnicode_New(size);
3405 if (v == NULL)
3406 goto onError;
3407 if (size == 0)
3408 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003409
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003412
Guido van Rossumd57fd912000-03-10 22:53:23 +00003413 while (s < end) {
3414 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003415 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417
3418 /* Non-escape characters are interpreted as Unicode ordinals */
3419 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003420 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421 continue;
3422 }
3423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 /* \ - Escapes */
3426 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003427 c = *s++;
3428 if (s > end)
3429 c = '\0'; /* Invalid after \ */
3430 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 case '\n': break;
3434 case '\\': *p++ = '\\'; break;
3435 case '\'': *p++ = '\''; break;
3436 case '\"': *p++ = '\"'; break;
3437 case 'b': *p++ = '\b'; break;
3438 case 'f': *p++ = '\014'; break; /* FF */
3439 case 't': *p++ = '\t'; break;
3440 case 'n': *p++ = '\n'; break;
3441 case 'r': *p++ = '\r'; break;
3442 case 'v': *p++ = '\013'; break; /* VT */
3443 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3444
Benjamin Peterson29060642009-01-31 22:14:21 +00003445 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 case '0': case '1': case '2': case '3':
3447 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003448 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003449 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003450 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003451 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003452 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003454 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 break;
3456
Benjamin Peterson29060642009-01-31 22:14:21 +00003457 /* hex escapes */
3458 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003460 digits = 2;
3461 message = "truncated \\xXX escape";
3462 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003466 digits = 4;
3467 message = "truncated \\uXXXX escape";
3468 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003471 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003472 digits = 8;
3473 message = "truncated \\UXXXXXXXX escape";
3474 hexescape:
3475 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 outpos = p-PyUnicode_AS_UNICODE(v);
3477 if (s+digits>end) {
3478 endinpos = size;
3479 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003480 errors, &errorHandler,
3481 "unicodeescape", "end of string in escape sequence",
3482 &starts, &end, &startinpos, &endinpos, &exc, &s,
3483 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 goto onError;
3485 goto nextByte;
3486 }
3487 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003488 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003489 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 endinpos = (s+i+1)-starts;
3491 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003492 errors, &errorHandler,
3493 "unicodeescape", message,
3494 &starts, &end, &startinpos, &endinpos, &exc, &s,
3495 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003496 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003498 }
3499 chr = (chr<<4) & ~0xF;
3500 if (c >= '0' && c <= '9')
3501 chr += c - '0';
3502 else if (c >= 'a' && c <= 'f')
3503 chr += 10 + c - 'a';
3504 else
3505 chr += 10 + c - 'A';
3506 }
3507 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003508 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 /* _decoding_error will have already written into the
3510 target buffer. */
3511 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003512 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003513 /* when we get here, chr is a 32-bit unicode character */
3514 if (chr <= 0xffff)
3515 /* UCS-2 character */
3516 *p++ = (Py_UNICODE) chr;
3517 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003518 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003519 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003520#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003521 *p++ = chr;
3522#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003523 chr -= 0x10000L;
3524 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003525 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003526#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003527 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 endinpos = s-starts;
3529 outpos = p-PyUnicode_AS_UNICODE(v);
3530 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003531 errors, &errorHandler,
3532 "unicodeescape", "illegal Unicode character",
3533 &starts, &end, &startinpos, &endinpos, &exc, &s,
3534 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003535 goto onError;
3536 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003537 break;
3538
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003540 case 'N':
3541 message = "malformed \\N character escape";
3542 if (ucnhash_CAPI == NULL) {
3543 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003544 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003545 if (ucnhash_CAPI == NULL)
3546 goto ucnhashError;
3547 }
3548 if (*s == '{') {
3549 const char *start = s+1;
3550 /* look for the closing brace */
3551 while (*s != '}' && s < end)
3552 s++;
3553 if (s > start && s < end && *s == '}') {
3554 /* found a name. look it up in the unicode database */
3555 message = "unknown Unicode character name";
3556 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003557 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003558 goto store;
3559 }
3560 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 endinpos = s-starts;
3562 outpos = p-PyUnicode_AS_UNICODE(v);
3563 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003564 errors, &errorHandler,
3565 "unicodeescape", message,
3566 &starts, &end, &startinpos, &endinpos, &exc, &s,
3567 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003568 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003569 break;
3570
3571 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003572 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 message = "\\ at end of string";
3574 s--;
3575 endinpos = s-starts;
3576 outpos = p-PyUnicode_AS_UNICODE(v);
3577 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003578 errors, &errorHandler,
3579 "unicodeescape", message,
3580 &starts, &end, &startinpos, &endinpos, &exc, &s,
3581 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003582 goto onError;
3583 }
3584 else {
3585 *p++ = '\\';
3586 *p++ = (unsigned char)s[-1];
3587 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003588 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003593 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003595 Py_XDECREF(errorHandler);
3596 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003598
Benjamin Peterson29060642009-01-31 22:14:21 +00003599 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003600 PyErr_SetString(
3601 PyExc_UnicodeError,
3602 "\\N escapes not supported (can't load unicodedata module)"
3603 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003604 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 Py_XDECREF(errorHandler);
3606 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003607 return NULL;
3608
Benjamin Peterson29060642009-01-31 22:14:21 +00003609 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 Py_XDECREF(errorHandler);
3612 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 return NULL;
3614}
3615
3616/* Return a Unicode-Escape string version of the Unicode object.
3617
3618 If quotes is true, the string is enclosed in u"" or u'' quotes as
3619 appropriate.
3620
3621*/
3622
Thomas Wouters477c8d52006-05-27 19:21:47 +00003623Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003624 Py_ssize_t size,
3625 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003626{
3627 /* like wcschr, but doesn't stop at NULL characters */
3628
3629 while (size-- > 0) {
3630 if (*s == ch)
3631 return s;
3632 s++;
3633 }
3634
3635 return NULL;
3636}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003637
Walter Dörwald79e913e2007-05-12 11:08:06 +00003638static const char *hexdigits = "0123456789abcdef";
3639
3640PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003643 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003646#ifdef Py_UNICODE_WIDE
3647 const Py_ssize_t expandsize = 10;
3648#else
3649 const Py_ssize_t expandsize = 6;
3650#endif
3651
Thomas Wouters89f507f2006-12-13 04:49:30 +00003652 /* XXX(nnorwitz): rather than over-allocating, it would be
3653 better to choose a different scheme. Perhaps scan the
3654 first N-chars of the string and allocate based on that size.
3655 */
3656 /* Initial allocation is based on the longest-possible unichr
3657 escape.
3658
3659 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3660 unichr, so in this case it's the longest unichr escape. In
3661 narrow (UTF-16) builds this is five chars per source unichr
3662 since there are two unichrs in the surrogate pair, so in narrow
3663 (UTF-16) builds it's not the longest unichr escape.
3664
3665 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3666 so in the narrow (UTF-16) build case it's the longest unichr
3667 escape.
3668 */
3669
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003670 if (size == 0)
3671 return PyBytes_FromStringAndSize(NULL, 0);
3672
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003673 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003674 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003675
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003676 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 2
3678 + expandsize*size
3679 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 if (repr == NULL)
3681 return NULL;
3682
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003683 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 while (size-- > 0) {
3686 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003687
Walter Dörwald79e913e2007-05-12 11:08:06 +00003688 /* Escape backslashes */
3689 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 *p++ = '\\';
3691 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003692 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003693 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003694
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003695#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003696 /* Map 21-bit characters to '\U00xxxxxx' */
3697 else if (ch >= 0x10000) {
3698 *p++ = '\\';
3699 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003700 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3701 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3702 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3703 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3704 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3705 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3706 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3707 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003708 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003709 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003710#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003711 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3712 else if (ch >= 0xD800 && ch < 0xDC00) {
3713 Py_UNICODE ch2;
3714 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003715
Benjamin Peterson29060642009-01-31 22:14:21 +00003716 ch2 = *s++;
3717 size--;
3718 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3719 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3720 *p++ = '\\';
3721 *p++ = 'U';
3722 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3723 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3724 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3725 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3726 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3727 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3728 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3729 *p++ = hexdigits[ucs & 0x0000000F];
3730 continue;
3731 }
3732 /* Fall through: isolated surrogates are copied as-is */
3733 s--;
3734 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003735 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003736#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003737
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003739 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 *p++ = '\\';
3741 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003742 *p++ = hexdigits[(ch >> 12) & 0x000F];
3743 *p++ = hexdigits[(ch >> 8) & 0x000F];
3744 *p++ = hexdigits[(ch >> 4) & 0x000F];
3745 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003747
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003748 /* Map special whitespace to '\t', \n', '\r' */
3749 else if (ch == '\t') {
3750 *p++ = '\\';
3751 *p++ = 't';
3752 }
3753 else if (ch == '\n') {
3754 *p++ = '\\';
3755 *p++ = 'n';
3756 }
3757 else if (ch == '\r') {
3758 *p++ = '\\';
3759 *p++ = 'r';
3760 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003761
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003762 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003763 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003765 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003766 *p++ = hexdigits[(ch >> 4) & 0x000F];
3767 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003768 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003769
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 /* Copy everything else as-is */
3771 else
3772 *p++ = (char) ch;
3773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003775 assert(p - PyBytes_AS_STRING(repr) > 0);
3776 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3777 return NULL;
3778 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779}
3780
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003781PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003783 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 if (!PyUnicode_Check(unicode)) {
3785 PyErr_BadArgument();
3786 return NULL;
3787 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003788 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3789 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003790 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791}
3792
3793/* --- Raw Unicode Escape Codec ------------------------------------------- */
3794
3795PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003796 Py_ssize_t size,
3797 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003799 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003800 Py_ssize_t startinpos;
3801 Py_ssize_t endinpos;
3802 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 const char *end;
3806 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807 PyObject *errorHandler = NULL;
3808 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003809
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 /* Escaped strings will always be longer than the resulting
3811 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 length after conversion to the true value. (But decoding error
3813 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 v = _PyUnicode_New(size);
3815 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003816 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820 end = s + size;
3821 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003822 unsigned char c;
3823 Py_UCS4 x;
3824 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003825 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826
Benjamin Peterson29060642009-01-31 22:14:21 +00003827 /* Non-escape characters are interpreted as Unicode ordinals */
3828 if (*s != '\\') {
3829 *p++ = (unsigned char)*s++;
3830 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003831 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003832 startinpos = s-starts;
3833
3834 /* \u-escapes are only interpreted iff the number of leading
3835 backslashes if odd */
3836 bs = s;
3837 for (;s < end;) {
3838 if (*s != '\\')
3839 break;
3840 *p++ = (unsigned char)*s++;
3841 }
3842 if (((s - bs) & 1) == 0 ||
3843 s >= end ||
3844 (*s != 'u' && *s != 'U')) {
3845 continue;
3846 }
3847 p--;
3848 count = *s=='u' ? 4 : 8;
3849 s++;
3850
3851 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3852 outpos = p-PyUnicode_AS_UNICODE(v);
3853 for (x = 0, i = 0; i < count; ++i, ++s) {
3854 c = (unsigned char)*s;
3855 if (!ISXDIGIT(c)) {
3856 endinpos = s-starts;
3857 if (unicode_decode_call_errorhandler(
3858 errors, &errorHandler,
3859 "rawunicodeescape", "truncated \\uXXXX",
3860 &starts, &end, &startinpos, &endinpos, &exc, &s,
3861 &v, &outpos, &p))
3862 goto onError;
3863 goto nextByte;
3864 }
3865 x = (x<<4) & ~0xF;
3866 if (c >= '0' && c <= '9')
3867 x += c - '0';
3868 else if (c >= 'a' && c <= 'f')
3869 x += 10 + c - 'a';
3870 else
3871 x += 10 + c - 'A';
3872 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003873 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003874 /* UCS-2 character */
3875 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003876 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003877 /* UCS-4 character. Either store directly, or as
3878 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003879#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003880 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003881#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003882 x -= 0x10000L;
3883 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3884 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003885#endif
3886 } else {
3887 endinpos = s-starts;
3888 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003889 if (unicode_decode_call_errorhandler(
3890 errors, &errorHandler,
3891 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003892 &starts, &end, &startinpos, &endinpos, &exc, &s,
3893 &v, &outpos, &p))
3894 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003895 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003896 nextByte:
3897 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003899 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003900 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 Py_XDECREF(errorHandler);
3902 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003904
Benjamin Peterson29060642009-01-31 22:14:21 +00003905 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 Py_XDECREF(errorHandler);
3908 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 return NULL;
3910}
3911
3912PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003913 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003915 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916 char *p;
3917 char *q;
3918
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003919#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003920 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003921#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003922 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003923#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003924
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003925 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003926 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003927
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003928 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929 if (repr == NULL)
3930 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003931 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003932 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003934 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935 while (size-- > 0) {
3936 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003937#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003938 /* Map 32-bit characters to '\Uxxxxxxxx' */
3939 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003940 *p++ = '\\';
3941 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003942 *p++ = hexdigits[(ch >> 28) & 0xf];
3943 *p++ = hexdigits[(ch >> 24) & 0xf];
3944 *p++ = hexdigits[(ch >> 20) & 0xf];
3945 *p++ = hexdigits[(ch >> 16) & 0xf];
3946 *p++ = hexdigits[(ch >> 12) & 0xf];
3947 *p++ = hexdigits[(ch >> 8) & 0xf];
3948 *p++ = hexdigits[(ch >> 4) & 0xf];
3949 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003950 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003951 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003952#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003953 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3954 if (ch >= 0xD800 && ch < 0xDC00) {
3955 Py_UNICODE ch2;
3956 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003957
Benjamin Peterson29060642009-01-31 22:14:21 +00003958 ch2 = *s++;
3959 size--;
3960 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3961 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3962 *p++ = '\\';
3963 *p++ = 'U';
3964 *p++ = hexdigits[(ucs >> 28) & 0xf];
3965 *p++ = hexdigits[(ucs >> 24) & 0xf];
3966 *p++ = hexdigits[(ucs >> 20) & 0xf];
3967 *p++ = hexdigits[(ucs >> 16) & 0xf];
3968 *p++ = hexdigits[(ucs >> 12) & 0xf];
3969 *p++ = hexdigits[(ucs >> 8) & 0xf];
3970 *p++ = hexdigits[(ucs >> 4) & 0xf];
3971 *p++ = hexdigits[ucs & 0xf];
3972 continue;
3973 }
3974 /* Fall through: isolated surrogates are copied as-is */
3975 s--;
3976 size++;
3977 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003978#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 /* Map 16-bit characters to '\uxxxx' */
3980 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 *p++ = '\\';
3982 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003983 *p++ = hexdigits[(ch >> 12) & 0xf];
3984 *p++ = hexdigits[(ch >> 8) & 0xf];
3985 *p++ = hexdigits[(ch >> 4) & 0xf];
3986 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 /* Copy everything else as-is */
3989 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990 *p++ = (char) ch;
3991 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003992 size = p - q;
3993
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003994 assert(size > 0);
3995 if (_PyBytes_Resize(&repr, size) < 0)
3996 return NULL;
3997 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998}
3999
4000PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4001{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004002 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004004 PyErr_BadArgument();
4005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004007 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4008 PyUnicode_GET_SIZE(unicode));
4009
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004010 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011}
4012
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004013/* --- Unicode Internal Codec ------------------------------------------- */
4014
4015PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004016 Py_ssize_t size,
4017 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004018{
4019 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004020 Py_ssize_t startinpos;
4021 Py_ssize_t endinpos;
4022 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004023 PyUnicodeObject *v;
4024 Py_UNICODE *p;
4025 const char *end;
4026 const char *reason;
4027 PyObject *errorHandler = NULL;
4028 PyObject *exc = NULL;
4029
Neal Norwitzd43069c2006-01-08 01:12:10 +00004030#ifdef Py_UNICODE_WIDE
4031 Py_UNICODE unimax = PyUnicode_GetMax();
4032#endif
4033
Thomas Wouters89f507f2006-12-13 04:49:30 +00004034 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004035 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4036 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004038 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004040 p = PyUnicode_AS_UNICODE(v);
4041 end = s + size;
4042
4043 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004044 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004045 /* We have to sanity check the raw data, otherwise doom looms for
4046 some malformed UCS-4 data. */
4047 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004048#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004049 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004050#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004051 end-s < Py_UNICODE_SIZE
4052 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004054 startinpos = s - starts;
4055 if (end-s < Py_UNICODE_SIZE) {
4056 endinpos = end-starts;
4057 reason = "truncated input";
4058 }
4059 else {
4060 endinpos = s - starts + Py_UNICODE_SIZE;
4061 reason = "illegal code point (> 0x10FFFF)";
4062 }
4063 outpos = p - PyUnicode_AS_UNICODE(v);
4064 if (unicode_decode_call_errorhandler(
4065 errors, &errorHandler,
4066 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004067 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004068 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004069 goto onError;
4070 }
4071 }
4072 else {
4073 p++;
4074 s += Py_UNICODE_SIZE;
4075 }
4076 }
4077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004078 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004079 goto onError;
4080 Py_XDECREF(errorHandler);
4081 Py_XDECREF(exc);
4082 return (PyObject *)v;
4083
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004085 Py_XDECREF(v);
4086 Py_XDECREF(errorHandler);
4087 Py_XDECREF(exc);
4088 return NULL;
4089}
4090
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091/* --- Latin-1 Codec ------------------------------------------------------ */
4092
4093PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004094 Py_ssize_t size,
4095 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096{
4097 PyUnicodeObject *v;
4098 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004099 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004100
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004102 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 Py_UNICODE r = *(unsigned char*)s;
4104 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004105 }
4106
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 v = _PyUnicode_New(size);
4108 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004113 e = s + size;
4114 /* Unrolling the copy makes it much faster by reducing the looping
4115 overhead. This is similar to what many memcpy() implementations do. */
4116 unrolled_end = e - 4;
4117 while (s < unrolled_end) {
4118 p[0] = (unsigned char) s[0];
4119 p[1] = (unsigned char) s[1];
4120 p[2] = (unsigned char) s[2];
4121 p[3] = (unsigned char) s[3];
4122 s += 4;
4123 p += 4;
4124 }
4125 while (s < e)
4126 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004128
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 Py_XDECREF(v);
4131 return NULL;
4132}
4133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134/* create or adjust a UnicodeEncodeError */
4135static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 const char *encoding,
4137 const Py_UNICODE *unicode, Py_ssize_t size,
4138 Py_ssize_t startpos, Py_ssize_t endpos,
4139 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 *exceptionObject = PyUnicodeEncodeError_Create(
4143 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 }
4145 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4147 goto onError;
4148 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4149 goto onError;
4150 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4151 goto onError;
4152 return;
4153 onError:
4154 Py_DECREF(*exceptionObject);
4155 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 }
4157}
4158
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159/* raises a UnicodeEncodeError */
4160static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 const char *encoding,
4162 const Py_UNICODE *unicode, Py_ssize_t size,
4163 Py_ssize_t startpos, Py_ssize_t endpos,
4164 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165{
4166 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170}
4171
4172/* error handling callback helper:
4173 build arguments, call the callback and check the arguments,
4174 put the result into newpos and return the replacement string, which
4175 has to be freed by the caller */
4176static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004177 PyObject **errorHandler,
4178 const char *encoding, const char *reason,
4179 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4180 Py_ssize_t startpos, Py_ssize_t endpos,
4181 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004183 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184
4185 PyObject *restuple;
4186 PyObject *resunicode;
4187
4188 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 }
4193
4194 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198
4199 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004204 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004205 Py_DECREF(restuple);
4206 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004208 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004209 &resunicode, newpos)) {
4210 Py_DECREF(restuple);
4211 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004213 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4214 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4215 Py_DECREF(restuple);
4216 return NULL;
4217 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004219 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004220 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4222 Py_DECREF(restuple);
4223 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004224 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 Py_INCREF(resunicode);
4226 Py_DECREF(restuple);
4227 return resunicode;
4228}
4229
4230static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 Py_ssize_t size,
4232 const char *errors,
4233 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234{
4235 /* output object */
4236 PyObject *res;
4237 /* pointers to the beginning and end+1 of input */
4238 const Py_UNICODE *startp = p;
4239 const Py_UNICODE *endp = p + size;
4240 /* pointer to the beginning of the unencodable characters */
4241 /* const Py_UNICODE *badp = NULL; */
4242 /* pointer into the output */
4243 char *str;
4244 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004245 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004246 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4247 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 PyObject *errorHandler = NULL;
4249 PyObject *exc = NULL;
4250 /* the following variable is used for caching string comparisons
4251 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4252 int known_errorHandler = -1;
4253
4254 /* allocate enough for a simple encoding without
4255 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004256 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004257 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004258 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004260 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004261 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 ressize = size;
4263
4264 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004266
Benjamin Peterson29060642009-01-31 22:14:21 +00004267 /* can we encode this? */
4268 if (c<limit) {
4269 /* no overflow check, because we know that the space is enough */
4270 *str++ = (char)c;
4271 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004272 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004273 else {
4274 Py_ssize_t unicodepos = p-startp;
4275 Py_ssize_t requiredsize;
4276 PyObject *repunicode;
4277 Py_ssize_t repsize;
4278 Py_ssize_t newpos;
4279 Py_ssize_t respos;
4280 Py_UNICODE *uni2;
4281 /* startpos for collecting unencodable chars */
4282 const Py_UNICODE *collstart = p;
4283 const Py_UNICODE *collend = p;
4284 /* find all unecodable characters */
4285 while ((collend < endp) && ((*collend)>=limit))
4286 ++collend;
4287 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4288 if (known_errorHandler==-1) {
4289 if ((errors==NULL) || (!strcmp(errors, "strict")))
4290 known_errorHandler = 1;
4291 else if (!strcmp(errors, "replace"))
4292 known_errorHandler = 2;
4293 else if (!strcmp(errors, "ignore"))
4294 known_errorHandler = 3;
4295 else if (!strcmp(errors, "xmlcharrefreplace"))
4296 known_errorHandler = 4;
4297 else
4298 known_errorHandler = 0;
4299 }
4300 switch (known_errorHandler) {
4301 case 1: /* strict */
4302 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4303 goto onError;
4304 case 2: /* replace */
4305 while (collstart++<collend)
4306 *str++ = '?'; /* fall through */
4307 case 3: /* ignore */
4308 p = collend;
4309 break;
4310 case 4: /* xmlcharrefreplace */
4311 respos = str - PyBytes_AS_STRING(res);
4312 /* determine replacement size (temporarily (mis)uses p) */
4313 for (p = collstart, repsize = 0; p < collend; ++p) {
4314 if (*p<10)
4315 repsize += 2+1+1;
4316 else if (*p<100)
4317 repsize += 2+2+1;
4318 else if (*p<1000)
4319 repsize += 2+3+1;
4320 else if (*p<10000)
4321 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004322#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 else
4324 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004325#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004326 else if (*p<100000)
4327 repsize += 2+5+1;
4328 else if (*p<1000000)
4329 repsize += 2+6+1;
4330 else
4331 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004332#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004333 }
4334 requiredsize = respos+repsize+(endp-collend);
4335 if (requiredsize > ressize) {
4336 if (requiredsize<2*ressize)
4337 requiredsize = 2*ressize;
4338 if (_PyBytes_Resize(&res, requiredsize))
4339 goto onError;
4340 str = PyBytes_AS_STRING(res) + respos;
4341 ressize = requiredsize;
4342 }
4343 /* generate replacement (temporarily (mis)uses p) */
4344 for (p = collstart; p < collend; ++p) {
4345 str += sprintf(str, "&#%d;", (int)*p);
4346 }
4347 p = collend;
4348 break;
4349 default:
4350 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4351 encoding, reason, startp, size, &exc,
4352 collstart-startp, collend-startp, &newpos);
4353 if (repunicode == NULL)
4354 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004355 if (PyBytes_Check(repunicode)) {
4356 /* Directly copy bytes result to output. */
4357 repsize = PyBytes_Size(repunicode);
4358 if (repsize > 1) {
4359 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004360 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004361 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4362 Py_DECREF(repunicode);
4363 goto onError;
4364 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004365 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004366 ressize += repsize-1;
4367 }
4368 memcpy(str, PyBytes_AsString(repunicode), repsize);
4369 str += repsize;
4370 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004371 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004372 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004373 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004374 /* need more space? (at least enough for what we
4375 have+the replacement+the rest of the string, so
4376 we won't have to check space for encodable characters) */
4377 respos = str - PyBytes_AS_STRING(res);
4378 repsize = PyUnicode_GET_SIZE(repunicode);
4379 requiredsize = respos+repsize+(endp-collend);
4380 if (requiredsize > ressize) {
4381 if (requiredsize<2*ressize)
4382 requiredsize = 2*ressize;
4383 if (_PyBytes_Resize(&res, requiredsize)) {
4384 Py_DECREF(repunicode);
4385 goto onError;
4386 }
4387 str = PyBytes_AS_STRING(res) + respos;
4388 ressize = requiredsize;
4389 }
4390 /* check if there is anything unencodable in the replacement
4391 and copy it to the output */
4392 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4393 c = *uni2;
4394 if (c >= limit) {
4395 raise_encode_exception(&exc, encoding, startp, size,
4396 unicodepos, unicodepos+1, reason);
4397 Py_DECREF(repunicode);
4398 goto onError;
4399 }
4400 *str = (char)c;
4401 }
4402 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004403 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004404 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004405 }
4406 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004407 /* Resize if we allocated to much */
4408 size = str - PyBytes_AS_STRING(res);
4409 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004410 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004411 if (_PyBytes_Resize(&res, size) < 0)
4412 goto onError;
4413 }
4414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 Py_XDECREF(errorHandler);
4416 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004417 return res;
4418
4419 onError:
4420 Py_XDECREF(res);
4421 Py_XDECREF(errorHandler);
4422 Py_XDECREF(exc);
4423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424}
4425
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 Py_ssize_t size,
4428 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431}
4432
4433PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4434{
4435 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 PyErr_BadArgument();
4437 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 }
4439 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 PyUnicode_GET_SIZE(unicode),
4441 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442}
4443
4444/* --- 7-bit ASCII Codec -------------------------------------------------- */
4445
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 Py_ssize_t size,
4448 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 PyUnicodeObject *v;
4452 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004453 Py_ssize_t startinpos;
4454 Py_ssize_t endinpos;
4455 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 const char *e;
4457 PyObject *errorHandler = NULL;
4458 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004459
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004461 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 Py_UNICODE r = *(unsigned char*)s;
4463 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004464 }
Tim Petersced69f82003-09-16 20:30:58 +00004465
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 v = _PyUnicode_New(size);
4467 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004468 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004470 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 e = s + size;
4473 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 register unsigned char c = (unsigned char)*s;
4475 if (c < 128) {
4476 *p++ = c;
4477 ++s;
4478 }
4479 else {
4480 startinpos = s-starts;
4481 endinpos = startinpos + 1;
4482 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4483 if (unicode_decode_call_errorhandler(
4484 errors, &errorHandler,
4485 "ascii", "ordinal not in range(128)",
4486 &starts, &e, &startinpos, &endinpos, &exc, &s,
4487 &v, &outpos, &p))
4488 goto onError;
4489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004491 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4493 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 Py_XDECREF(errorHandler);
4495 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004497
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500 Py_XDECREF(errorHandler);
4501 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 return NULL;
4503}
4504
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 Py_ssize_t size,
4507 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510}
4511
4512PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4513{
4514 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 PyErr_BadArgument();
4516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517 }
4518 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 PyUnicode_GET_SIZE(unicode),
4520 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521}
4522
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004523#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004524
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004525/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004526
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004527#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004528#define NEED_RETRY
4529#endif
4530
4531/* XXX This code is limited to "true" double-byte encodings, as
4532 a) it assumes an incomplete character consists of a single byte, and
4533 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004535
/* Report whether the byte at s[offset] should be treated as a DBCS
   lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise returns
   1 if CharPrev() returns the byte's own position, if the byte that
   CharPrev() points at is not itself a lead byte, or if CharPrev()
   stepped back exactly two bytes; and 0 in the remaining case. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4546
4547/*
4548 * Decode MBCS string into unicode object. If 'final' is set, converts
4549 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4550 */
4551static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 const char *s, /* MBCS string */
4553 int size, /* sizeof MBCS string */
4554 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004555{
4556 Py_UNICODE *p;
4557 Py_ssize_t n = 0;
4558 int usize = 0;
4559
4560 assert(size >= 0);
4561
4562 /* Skip trailing lead-byte unless 'final' is set */
4563 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004565
4566 /* First get the size of the result */
4567 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004568 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4569 if (usize == 0) {
4570 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4571 return -1;
4572 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004573 }
4574
4575 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 /* Create unicode object */
4577 *v = _PyUnicode_New(usize);
4578 if (*v == NULL)
4579 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004580 }
4581 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 /* Extend unicode object */
4583 n = PyUnicode_GET_SIZE(*v);
4584 if (_PyUnicode_Resize(v, n + usize) < 0)
4585 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004586 }
4587
4588 /* Do the conversion */
4589 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 p = PyUnicode_AS_UNICODE(*v) + n;
4591 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4592 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4593 return -1;
4594 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004595 }
4596
4597 return size;
4598}
4599
4600PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 Py_ssize_t size,
4602 const char *errors,
4603 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004604{
4605 PyUnicodeObject *v = NULL;
4606 int done;
4607
4608 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004609 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004610
4611#ifdef NEED_RETRY
4612 retry:
4613 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004615 else
4616#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004618
4619 if (done < 0) {
4620 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004622 }
4623
4624 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004626
4627#ifdef NEED_RETRY
4628 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 s += done;
4630 size -= done;
4631 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004632 }
4633#endif
4634
4635 return (PyObject *)v;
4636}
4637
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004638PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004639 Py_ssize_t size,
4640 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004641{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004642 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4643}
4644
4645/*
4646 * Convert unicode into string object (MBCS).
4647 * Returns 0 if succeed, -1 otherwise.
4648 */
4649static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 const Py_UNICODE *p, /* unicode */
4651 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004652{
4653 int mbcssize = 0;
4654 Py_ssize_t n = 0;
4655
4656 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004657
4658 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004659 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4661 if (mbcssize == 0) {
4662 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4663 return -1;
4664 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004665 }
4666
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004667 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 /* Create string object */
4669 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4670 if (*repr == NULL)
4671 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004672 }
4673 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 /* Extend string object */
4675 n = PyBytes_Size(*repr);
4676 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4677 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004678 }
4679
4680 /* Do the conversion */
4681 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 char *s = PyBytes_AS_STRING(*repr) + n;
4683 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4684 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4685 return -1;
4686 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004687 }
4688
4689 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004690}
4691
4692PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 Py_ssize_t size,
4694 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004695{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004696 PyObject *repr = NULL;
4697 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004698
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004699#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004701 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004702 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004703 else
4704#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004705 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004706
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004707 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004708 Py_XDECREF(repr);
4709 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004710 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004711
4712#ifdef NEED_RETRY
4713 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004714 p += INT_MAX;
4715 size -= INT_MAX;
4716 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004717 }
4718#endif
4719
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004720 return repr;
4721}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004722
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004723PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4724{
4725 if (!PyUnicode_Check(unicode)) {
4726 PyErr_BadArgument();
4727 return NULL;
4728 }
4729 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 PyUnicode_GET_SIZE(unicode),
4731 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004732}
4733
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004734#undef NEED_RETRY
4735
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004736#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004737
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738/* --- Character Mapping Codec -------------------------------------------- */
4739
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004741 Py_ssize_t size,
4742 PyObject *mapping,
4743 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004746 Py_ssize_t startinpos;
4747 Py_ssize_t endinpos;
4748 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 PyUnicodeObject *v;
4751 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004752 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 PyObject *errorHandler = NULL;
4754 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004755 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004756 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004757
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 /* Default to Latin-1 */
4759 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004760 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761
4762 v = _PyUnicode_New(size);
4763 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004769 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 mapstring = PyUnicode_AS_UNICODE(mapping);
4771 maplen = PyUnicode_GET_SIZE(mapping);
4772 while (s < e) {
4773 unsigned char ch = *s;
4774 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 if (ch < maplen)
4777 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 if (x == 0xfffe) {
4780 /* undefined mapping */
4781 outpos = p-PyUnicode_AS_UNICODE(v);
4782 startinpos = s-starts;
4783 endinpos = startinpos+1;
4784 if (unicode_decode_call_errorhandler(
4785 errors, &errorHandler,
4786 "charmap", "character maps to <undefined>",
4787 &starts, &e, &startinpos, &endinpos, &exc, &s,
4788 &v, &outpos, &p)) {
4789 goto onError;
4790 }
4791 continue;
4792 }
4793 *p++ = x;
4794 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004795 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004796 }
4797 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 while (s < e) {
4799 unsigned char ch = *s;
4800 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004801
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4803 w = PyLong_FromLong((long)ch);
4804 if (w == NULL)
4805 goto onError;
4806 x = PyObject_GetItem(mapping, w);
4807 Py_DECREF(w);
4808 if (x == NULL) {
4809 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4810 /* No mapping found means: mapping is undefined. */
4811 PyErr_Clear();
4812 x = Py_None;
4813 Py_INCREF(x);
4814 } else
4815 goto onError;
4816 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004817
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 /* Apply mapping */
4819 if (PyLong_Check(x)) {
4820 long value = PyLong_AS_LONG(x);
4821 if (value < 0 || value > 65535) {
4822 PyErr_SetString(PyExc_TypeError,
4823 "character mapping must be in range(65536)");
4824 Py_DECREF(x);
4825 goto onError;
4826 }
4827 *p++ = (Py_UNICODE)value;
4828 }
4829 else if (x == Py_None) {
4830 /* undefined mapping */
4831 outpos = p-PyUnicode_AS_UNICODE(v);
4832 startinpos = s-starts;
4833 endinpos = startinpos+1;
4834 if (unicode_decode_call_errorhandler(
4835 errors, &errorHandler,
4836 "charmap", "character maps to <undefined>",
4837 &starts, &e, &startinpos, &endinpos, &exc, &s,
4838 &v, &outpos, &p)) {
4839 Py_DECREF(x);
4840 goto onError;
4841 }
4842 Py_DECREF(x);
4843 continue;
4844 }
4845 else if (PyUnicode_Check(x)) {
4846 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004847
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 if (targetsize == 1)
4849 /* 1-1 mapping */
4850 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004851
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 else if (targetsize > 1) {
4853 /* 1-n mapping */
4854 if (targetsize > extrachars) {
4855 /* resize first */
4856 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4857 Py_ssize_t needed = (targetsize - extrachars) + \
4858 (targetsize << 2);
4859 extrachars += needed;
4860 /* XXX overflow detection missing */
4861 if (_PyUnicode_Resize(&v,
4862 PyUnicode_GET_SIZE(v) + needed) < 0) {
4863 Py_DECREF(x);
4864 goto onError;
4865 }
4866 p = PyUnicode_AS_UNICODE(v) + oldpos;
4867 }
4868 Py_UNICODE_COPY(p,
4869 PyUnicode_AS_UNICODE(x),
4870 targetsize);
4871 p += targetsize;
4872 extrachars -= targetsize;
4873 }
4874 /* 1-0 mapping: skip the character */
4875 }
4876 else {
4877 /* wrong return value */
4878 PyErr_SetString(PyExc_TypeError,
4879 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004880 Py_DECREF(x);
4881 goto onError;
4882 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004883 Py_DECREF(x);
4884 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004885 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 }
4887 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4889 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004890 Py_XDECREF(errorHandler);
4891 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004893
Benjamin Peterson29060642009-01-31 22:14:21 +00004894 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895 Py_XDECREF(errorHandler);
4896 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 Py_XDECREF(v);
4898 return NULL;
4899}
4900
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004901/* Charmap encoding: the lookup table */
4902
4903struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004904 PyObject_HEAD
4905 unsigned char level1[32];
4906 int count2, count3;
4907 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004908};
4909
4910static PyObject*
4911encoding_map_size(PyObject *obj, PyObject* args)
4912{
4913 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004914 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004915 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004916}
4917
4918static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004919 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 PyDoc_STR("Return the size (in bytes) of this object") },
4921 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004922};
4923
4924static void
4925encoding_map_dealloc(PyObject* o)
4926{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004927 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004928}
4929
4930static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004931 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004932 "EncodingMap", /*tp_name*/
4933 sizeof(struct encoding_map), /*tp_basicsize*/
4934 0, /*tp_itemsize*/
4935 /* methods */
4936 encoding_map_dealloc, /*tp_dealloc*/
4937 0, /*tp_print*/
4938 0, /*tp_getattr*/
4939 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004940 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 0, /*tp_repr*/
4942 0, /*tp_as_number*/
4943 0, /*tp_as_sequence*/
4944 0, /*tp_as_mapping*/
4945 0, /*tp_hash*/
4946 0, /*tp_call*/
4947 0, /*tp_str*/
4948 0, /*tp_getattro*/
4949 0, /*tp_setattro*/
4950 0, /*tp_as_buffer*/
4951 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4952 0, /*tp_doc*/
4953 0, /*tp_traverse*/
4954 0, /*tp_clear*/
4955 0, /*tp_richcompare*/
4956 0, /*tp_weaklistoffset*/
4957 0, /*tp_iter*/
4958 0, /*tp_iternext*/
4959 encoding_map_methods, /*tp_methods*/
4960 0, /*tp_members*/
4961 0, /*tp_getset*/
4962 0, /*tp_base*/
4963 0, /*tp_dict*/
4964 0, /*tp_descr_get*/
4965 0, /*tp_descr_set*/
4966 0, /*tp_dictoffset*/
4967 0, /*tp_init*/
4968 0, /*tp_alloc*/
4969 0, /*tp_new*/
4970 0, /*tp_free*/
4971 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004972};
4973
4974PyObject*
4975PyUnicode_BuildEncodingMap(PyObject* string)
4976{
4977 Py_UNICODE *decode;
4978 PyObject *result;
4979 struct encoding_map *mresult;
4980 int i;
4981 int need_dict = 0;
4982 unsigned char level1[32];
4983 unsigned char level2[512];
4984 unsigned char *mlevel1, *mlevel2, *mlevel3;
4985 int count2 = 0, count3 = 0;
4986
4987 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4988 PyErr_BadArgument();
4989 return NULL;
4990 }
4991 decode = PyUnicode_AS_UNICODE(string);
4992 memset(level1, 0xFF, sizeof level1);
4993 memset(level2, 0xFF, sizeof level2);
4994
4995 /* If there isn't a one-to-one mapping of NULL to \0,
4996 or if there are non-BMP characters, we need to use
4997 a mapping dictionary. */
4998 if (decode[0] != 0)
4999 need_dict = 1;
5000 for (i = 1; i < 256; i++) {
5001 int l1, l2;
5002 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005003#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005004 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005005#endif
5006 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005007 need_dict = 1;
5008 break;
5009 }
5010 if (decode[i] == 0xFFFE)
5011 /* unmapped character */
5012 continue;
5013 l1 = decode[i] >> 11;
5014 l2 = decode[i] >> 7;
5015 if (level1[l1] == 0xFF)
5016 level1[l1] = count2++;
5017 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005018 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005019 }
5020
5021 if (count2 >= 0xFF || count3 >= 0xFF)
5022 need_dict = 1;
5023
5024 if (need_dict) {
5025 PyObject *result = PyDict_New();
5026 PyObject *key, *value;
5027 if (!result)
5028 return NULL;
5029 for (i = 0; i < 256; i++) {
5030 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005031 key = PyLong_FromLong(decode[i]);
5032 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005033 if (!key || !value)
5034 goto failed1;
5035 if (PyDict_SetItem(result, key, value) == -1)
5036 goto failed1;
5037 Py_DECREF(key);
5038 Py_DECREF(value);
5039 }
5040 return result;
5041 failed1:
5042 Py_XDECREF(key);
5043 Py_XDECREF(value);
5044 Py_DECREF(result);
5045 return NULL;
5046 }
5047
5048 /* Create a three-level trie */
5049 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5050 16*count2 + 128*count3 - 1);
5051 if (!result)
5052 return PyErr_NoMemory();
5053 PyObject_Init(result, &EncodingMapType);
5054 mresult = (struct encoding_map*)result;
5055 mresult->count2 = count2;
5056 mresult->count3 = count3;
5057 mlevel1 = mresult->level1;
5058 mlevel2 = mresult->level23;
5059 mlevel3 = mresult->level23 + 16*count2;
5060 memcpy(mlevel1, level1, 32);
5061 memset(mlevel2, 0xFF, 16*count2);
5062 memset(mlevel3, 0, 128*count3);
5063 count3 = 0;
5064 for (i = 1; i < 256; i++) {
5065 int o1, o2, o3, i2, i3;
5066 if (decode[i] == 0xFFFE)
5067 /* unmapped character */
5068 continue;
5069 o1 = decode[i]>>11;
5070 o2 = (decode[i]>>7) & 0xF;
5071 i2 = 16*mlevel1[o1] + o2;
5072 if (mlevel2[i2] == 0xFF)
5073 mlevel2[i2] = count3++;
5074 o3 = decode[i] & 0x7F;
5075 i3 = 128*mlevel2[i2] + o3;
5076 mlevel3[i3] = i;
5077 }
5078 return result;
5079}
5080
5081static int
5082encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5083{
5084 struct encoding_map *map = (struct encoding_map*)mapping;
5085 int l1 = c>>11;
5086 int l2 = (c>>7) & 0xF;
5087 int l3 = c & 0x7F;
5088 int i;
5089
5090#ifdef Py_UNICODE_WIDE
5091 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005092 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005093 }
5094#endif
5095 if (c == 0)
5096 return 0;
5097 /* level 1*/
5098 i = map->level1[l1];
5099 if (i == 0xFF) {
5100 return -1;
5101 }
5102 /* level 2*/
5103 i = map->level23[16*i+l2];
5104 if (i == 0xFF) {
5105 return -1;
5106 }
5107 /* level 3 */
5108 i = map->level23[16*map->count2 + 128*i + l3];
5109 if (i == 0) {
5110 return -1;
5111 }
5112 return i;
5113}
5114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115/* Lookup the character ch in the mapping. If the character
5116 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005117 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005118static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119{
Christian Heimes217cfd12007-12-02 14:31:20 +00005120 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005121 PyObject *x;
5122
5123 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 x = PyObject_GetItem(mapping, w);
5126 Py_DECREF(w);
5127 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5129 /* No mapping found means: mapping is undefined. */
5130 PyErr_Clear();
5131 x = Py_None;
5132 Py_INCREF(x);
5133 return x;
5134 } else
5135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005137 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005139 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 long value = PyLong_AS_LONG(x);
5141 if (value < 0 || value > 255) {
5142 PyErr_SetString(PyExc_TypeError,
5143 "character mapping must be in range(256)");
5144 Py_DECREF(x);
5145 return NULL;
5146 }
5147 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005149 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005150 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 /* wrong return value */
5153 PyErr_Format(PyExc_TypeError,
5154 "character mapping must return integer, bytes or None, not %.400s",
5155 x->ob_type->tp_name);
5156 Py_DECREF(x);
5157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 }
5159}
5160
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005161static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005162charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005163{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005164 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5165 /* exponentially overallocate to minimize reallocations */
5166 if (requiredsize < 2*outsize)
5167 requiredsize = 2*outsize;
5168 if (_PyBytes_Resize(outobj, requiredsize))
5169 return -1;
5170 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005171}
5172
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
/* lookup the character, put the result in the output string and adjust
   various state variables. Resize the output bytes object if not enough
   space is available. Return enc_SUCCESS if the character was written,
   enc_FAILED if the mapping was undefined (in which case no character
   was written), or enc_EXCEPTION if the lookup or a reallocation
   failed. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005183charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005186 PyObject *rep;
5187 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005188 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005189
Christian Heimes90aa7642007-12-19 02:45:37 +00005190 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005191 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005193 if (res == -1)
5194 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 if (outsize<requiredsize)
5196 if (charmapencode_resize(outobj, outpos, requiredsize))
5197 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005198 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 outstart[(*outpos)++] = (char)res;
5200 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005201 }
5202
5203 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005204 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005206 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 Py_DECREF(rep);
5208 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005209 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 if (PyLong_Check(rep)) {
5211 Py_ssize_t requiredsize = *outpos+1;
5212 if (outsize<requiredsize)
5213 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5214 Py_DECREF(rep);
5215 return enc_EXCEPTION;
5216 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005217 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005219 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 else {
5221 const char *repchars = PyBytes_AS_STRING(rep);
5222 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5223 Py_ssize_t requiredsize = *outpos+repsize;
5224 if (outsize<requiredsize)
5225 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5226 Py_DECREF(rep);
5227 return enc_EXCEPTION;
5228 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005229 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 memcpy(outstart + *outpos, repchars, repsize);
5231 *outpos += repsize;
5232 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005233 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005234 Py_DECREF(rep);
5235 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005236}
5237
5238/* handle an error in PyUnicode_EncodeCharmap
5239 Return 0 on success, -1 on error */
5240static
5241int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005242 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005243 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005244 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005245 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246{
5247 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005248 Py_ssize_t repsize;
5249 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 Py_UNICODE *uni2;
5251 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005252 Py_ssize_t collstartpos = *inpos;
5253 Py_ssize_t collendpos = *inpos+1;
5254 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005255 char *encoding = "charmap";
5256 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005257 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005259 /* find all unencodable characters */
5260 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005261 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005262 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 int res = encoding_map_lookup(p[collendpos], mapping);
5264 if (res != -1)
5265 break;
5266 ++collendpos;
5267 continue;
5268 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005269
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 rep = charmapencode_lookup(p[collendpos], mapping);
5271 if (rep==NULL)
5272 return -1;
5273 else if (rep!=Py_None) {
5274 Py_DECREF(rep);
5275 break;
5276 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005277 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279 }
5280 /* cache callback name lookup
5281 * (if not done yet, i.e. it's the first error) */
5282 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 if ((errors==NULL) || (!strcmp(errors, "strict")))
5284 *known_errorHandler = 1;
5285 else if (!strcmp(errors, "replace"))
5286 *known_errorHandler = 2;
5287 else if (!strcmp(errors, "ignore"))
5288 *known_errorHandler = 3;
5289 else if (!strcmp(errors, "xmlcharrefreplace"))
5290 *known_errorHandler = 4;
5291 else
5292 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005293 }
5294 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005295 case 1: /* strict */
5296 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5297 return -1;
5298 case 2: /* replace */
5299 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 x = charmapencode_output('?', mapping, res, respos);
5301 if (x==enc_EXCEPTION) {
5302 return -1;
5303 }
5304 else if (x==enc_FAILED) {
5305 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5306 return -1;
5307 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005308 }
5309 /* fall through */
5310 case 3: /* ignore */
5311 *inpos = collendpos;
5312 break;
5313 case 4: /* xmlcharrefreplace */
5314 /* generate replacement (temporarily (mis)uses p) */
5315 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 char buffer[2+29+1+1];
5317 char *cp;
5318 sprintf(buffer, "&#%d;", (int)p[collpos]);
5319 for (cp = buffer; *cp; ++cp) {
5320 x = charmapencode_output(*cp, mapping, res, respos);
5321 if (x==enc_EXCEPTION)
5322 return -1;
5323 else if (x==enc_FAILED) {
5324 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5325 return -1;
5326 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005327 }
5328 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005329 *inpos = collendpos;
5330 break;
5331 default:
5332 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 encoding, reason, p, size, exceptionObject,
5334 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005335 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005337 if (PyBytes_Check(repunicode)) {
5338 /* Directly copy bytes result to output. */
5339 Py_ssize_t outsize = PyBytes_Size(*res);
5340 Py_ssize_t requiredsize;
5341 repsize = PyBytes_Size(repunicode);
5342 requiredsize = *respos + repsize;
5343 if (requiredsize > outsize)
5344 /* Make room for all additional bytes. */
5345 if (charmapencode_resize(res, respos, requiredsize)) {
5346 Py_DECREF(repunicode);
5347 return -1;
5348 }
5349 memcpy(PyBytes_AsString(*res) + *respos,
5350 PyBytes_AsString(repunicode), repsize);
5351 *respos += repsize;
5352 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005353 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005354 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005356 /* generate replacement */
5357 repsize = PyUnicode_GET_SIZE(repunicode);
5358 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 x = charmapencode_output(*uni2, mapping, res, respos);
5360 if (x==enc_EXCEPTION) {
5361 return -1;
5362 }
5363 else if (x==enc_FAILED) {
5364 Py_DECREF(repunicode);
5365 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5366 return -1;
5367 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005368 }
5369 *inpos = newpos;
5370 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 }
5372 return 0;
5373}
5374
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 Py_ssize_t size,
5377 PyObject *mapping,
5378 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380 /* output object */
5381 PyObject *res = NULL;
5382 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005383 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005385 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 PyObject *errorHandler = NULL;
5387 PyObject *exc = NULL;
5388 /* the following variable is used for caching string comparisons
5389 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5390 * 3=ignore, 4=xmlcharrefreplace */
5391 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
5393 /* Default to Latin-1 */
5394 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 /* allocate enough for a simple encoding without
5398 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005399 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005400 if (res == NULL)
5401 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005402 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005405 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 /* try to encode it */
5407 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5408 if (x==enc_EXCEPTION) /* error */
5409 goto onError;
5410 if (x==enc_FAILED) { /* unencodable character */
5411 if (charmap_encoding_error(p, size, &inpos, mapping,
5412 &exc,
5413 &known_errorHandler, &errorHandler, errors,
5414 &res, &respos)) {
5415 goto onError;
5416 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005417 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 else
5419 /* done with this character => adjust input position */
5420 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005423 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005424 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005425 if (_PyBytes_Resize(&res, respos) < 0)
5426 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005427
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005428 Py_XDECREF(exc);
5429 Py_XDECREF(errorHandler);
5430 return res;
5431
Benjamin Peterson29060642009-01-31 22:14:21 +00005432 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005433 Py_XDECREF(res);
5434 Py_XDECREF(exc);
5435 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 return NULL;
5437}
5438
5439PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441{
5442 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 PyErr_BadArgument();
5444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 }
5446 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 PyUnicode_GET_SIZE(unicode),
5448 mapping,
5449 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450}
5451
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005452/* create or adjust a UnicodeTranslateError */
5453static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 const Py_UNICODE *unicode, Py_ssize_t size,
5455 Py_ssize_t startpos, Py_ssize_t endpos,
5456 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005459 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 }
5462 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5464 goto onError;
5465 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5466 goto onError;
5467 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5468 goto onError;
5469 return;
5470 onError:
5471 Py_DECREF(*exceptionObject);
5472 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 }
5474}
5475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476/* raises a UnicodeTranslateError */
5477static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 const Py_UNICODE *unicode, Py_ssize_t size,
5479 Py_ssize_t startpos, Py_ssize_t endpos,
5480 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005481{
5482 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005484 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486}
5487
5488/* error handling callback helper:
5489 build arguments, call the callback and check the arguments,
5490 put the result into newpos and return the replacement string, which
5491 has to be freed by the caller */
5492static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 PyObject **errorHandler,
5494 const char *reason,
5495 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5496 Py_ssize_t startpos, Py_ssize_t endpos,
5497 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005499 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005500
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005501 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 PyObject *restuple;
5503 PyObject *resunicode;
5504
5505 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005507 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 }
5510
5511 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005513 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515
5516 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005520 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005521 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 Py_DECREF(restuple);
5523 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 }
5525 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 &resunicode, &i_newpos)) {
5527 Py_DECREF(restuple);
5528 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005529 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005530 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005532 else
5533 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005534 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005535 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5536 Py_DECREF(restuple);
5537 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005538 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005539 Py_INCREF(resunicode);
5540 Py_DECREF(restuple);
5541 return resunicode;
5542}
5543
5544/* Lookup the character ch in the mapping and put the result in result,
5545 which must be decrefed by the caller.
5546 Return 0 on success, -1 on error */
5547static
5548int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5549{
Christian Heimes217cfd12007-12-02 14:31:20 +00005550 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005551 PyObject *x;
5552
5553 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005555 x = PyObject_GetItem(mapping, w);
5556 Py_DECREF(w);
5557 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5559 /* No mapping found means: use 1:1 mapping. */
5560 PyErr_Clear();
5561 *result = NULL;
5562 return 0;
5563 } else
5564 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005565 }
5566 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 *result = x;
5568 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005570 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 long value = PyLong_AS_LONG(x);
5572 long max = PyUnicode_GetMax();
5573 if (value < 0 || value > max) {
5574 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005575 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 Py_DECREF(x);
5577 return -1;
5578 }
5579 *result = x;
5580 return 0;
5581 }
5582 else if (PyUnicode_Check(x)) {
5583 *result = x;
5584 return 0;
5585 }
5586 else {
5587 /* wrong return value */
5588 PyErr_SetString(PyExc_TypeError,
5589 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005590 Py_DECREF(x);
5591 return -1;
5592 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005593}
5594/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 if not reallocate and adjust various state variables.
5596 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597static
Walter Dörwald4894c302003-10-24 14:25:28 +00005598int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005601 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005602 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 /* remember old output position */
5604 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5605 /* exponentially overallocate to minimize reallocations */
5606 if (requiredsize < 2 * oldsize)
5607 requiredsize = 2 * oldsize;
5608 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5609 return -1;
5610 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611 }
5612 return 0;
5613}
5614/* lookup the character, put the result in the output string and adjust
5615 various state variables. Return a new reference to the object that
5616 was put in the output buffer in *result, or Py_None, if the mapping was
5617 undefined (in which case no character was written).
5618 The called must decref result.
5619 Return 0 on success, -1 on error. */
5620static
Walter Dörwald4894c302003-10-24 14:25:28 +00005621int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5623 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624{
Walter Dörwald4894c302003-10-24 14:25:28 +00005625 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 /* not found => default to 1:1 mapping */
5629 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 }
5631 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005633 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 /* no overflow check, because we know that the space is enough */
5635 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 }
5637 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005638 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5639 if (repsize==1) {
5640 /* no overflow check, because we know that the space is enough */
5641 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5642 }
5643 else if (repsize!=0) {
5644 /* more than one character */
5645 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5646 (insize - (curinp-startinp)) +
5647 repsize - 1;
5648 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5649 return -1;
5650 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5651 *outp += repsize;
5652 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 }
5654 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 return 0;
5657}
5658
5659PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 Py_ssize_t size,
5661 PyObject *mapping,
5662 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664 /* output object */
5665 PyObject *res = NULL;
5666 /* pointers to the beginning and end+1 of input */
5667 const Py_UNICODE *startp = p;
5668 const Py_UNICODE *endp = p + size;
5669 /* pointer into the output */
5670 Py_UNICODE *str;
5671 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005672 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005673 char *reason = "character maps to <undefined>";
5674 PyObject *errorHandler = NULL;
5675 PyObject *exc = NULL;
5676 /* the following variable is used for caching string comparisons
5677 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5678 * 3=ignore, 4=xmlcharrefreplace */
5679 int known_errorHandler = -1;
5680
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 PyErr_BadArgument();
5683 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685
5686 /* allocate enough for a simple 1:1 translation without
5687 replacements, if we need more, we'll resize */
5688 res = PyUnicode_FromUnicode(NULL, size);
5689 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 /* try to encode it */
5697 PyObject *x = NULL;
5698 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5699 Py_XDECREF(x);
5700 goto onError;
5701 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005702 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 if (x!=Py_None) /* it worked => adjust input pointer */
5704 ++p;
5705 else { /* untranslatable character */
5706 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5707 Py_ssize_t repsize;
5708 Py_ssize_t newpos;
5709 Py_UNICODE *uni2;
5710 /* startpos for collecting untranslatable chars */
5711 const Py_UNICODE *collstart = p;
5712 const Py_UNICODE *collend = p+1;
5713 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 /* find all untranslatable characters */
5716 while (collend < endp) {
5717 if (charmaptranslate_lookup(*collend, mapping, &x))
5718 goto onError;
5719 Py_XDECREF(x);
5720 if (x!=Py_None)
5721 break;
5722 ++collend;
5723 }
5724 /* cache callback name lookup
5725 * (if not done yet, i.e. it's the first error) */
5726 if (known_errorHandler==-1) {
5727 if ((errors==NULL) || (!strcmp(errors, "strict")))
5728 known_errorHandler = 1;
5729 else if (!strcmp(errors, "replace"))
5730 known_errorHandler = 2;
5731 else if (!strcmp(errors, "ignore"))
5732 known_errorHandler = 3;
5733 else if (!strcmp(errors, "xmlcharrefreplace"))
5734 known_errorHandler = 4;
5735 else
5736 known_errorHandler = 0;
5737 }
5738 switch (known_errorHandler) {
5739 case 1: /* strict */
5740 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005741 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 case 2: /* replace */
5743 /* No need to check for space, this is a 1:1 replacement */
5744 for (coll = collstart; coll<collend; ++coll)
5745 *str++ = '?';
5746 /* fall through */
5747 case 3: /* ignore */
5748 p = collend;
5749 break;
5750 case 4: /* xmlcharrefreplace */
5751 /* generate replacement (temporarily (mis)uses p) */
5752 for (p = collstart; p < collend; ++p) {
5753 char buffer[2+29+1+1];
5754 char *cp;
5755 sprintf(buffer, "&#%d;", (int)*p);
5756 if (charmaptranslate_makespace(&res, &str,
5757 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5758 goto onError;
5759 for (cp = buffer; *cp; ++cp)
5760 *str++ = *cp;
5761 }
5762 p = collend;
5763 break;
5764 default:
5765 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5766 reason, startp, size, &exc,
5767 collstart-startp, collend-startp, &newpos);
5768 if (repunicode == NULL)
5769 goto onError;
5770 /* generate replacement */
5771 repsize = PyUnicode_GET_SIZE(repunicode);
5772 if (charmaptranslate_makespace(&res, &str,
5773 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5774 Py_DECREF(repunicode);
5775 goto onError;
5776 }
5777 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5778 *str++ = *uni2;
5779 p = startp + newpos;
5780 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005781 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005782 }
5783 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 /* Resize if we allocated to much */
5785 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005786 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 if (PyUnicode_Resize(&res, respos) < 0)
5788 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 }
5790 Py_XDECREF(exc);
5791 Py_XDECREF(errorHandler);
5792 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 Py_XDECREF(res);
5796 Py_XDECREF(exc);
5797 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 return NULL;
5799}
5800
5801PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 PyObject *mapping,
5803 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804{
5805 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005806
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 str = PyUnicode_FromObject(str);
5808 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 PyUnicode_GET_SIZE(str),
5812 mapping,
5813 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 Py_DECREF(str);
5815 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005816
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 Py_XDECREF(str);
5819 return NULL;
5820}
Tim Petersced69f82003-09-16 20:30:58 +00005821
Guido van Rossum9e896b32000-04-05 20:11:21 +00005822/* --- Decimal Encoder ---------------------------------------------------- */
5823
5824int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 Py_ssize_t length,
5826 char *output,
5827 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005828{
5829 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 PyObject *errorHandler = NULL;
5831 PyObject *exc = NULL;
5832 const char *encoding = "decimal";
5833 const char *reason = "invalid decimal Unicode string";
5834 /* the following variable is used for caching string comparisons
5835 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5836 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005837
5838 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 PyErr_BadArgument();
5840 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005841 }
5842
5843 p = s;
5844 end = s + length;
5845 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 register Py_UNICODE ch = *p;
5847 int decimal;
5848 PyObject *repunicode;
5849 Py_ssize_t repsize;
5850 Py_ssize_t newpos;
5851 Py_UNICODE *uni2;
5852 Py_UNICODE *collstart;
5853 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005854
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005856 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 ++p;
5858 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005859 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 decimal = Py_UNICODE_TODECIMAL(ch);
5861 if (decimal >= 0) {
5862 *output++ = '0' + decimal;
5863 ++p;
5864 continue;
5865 }
5866 if (0 < ch && ch < 256) {
5867 *output++ = (char)ch;
5868 ++p;
5869 continue;
5870 }
5871 /* All other characters are considered unencodable */
5872 collstart = p;
5873 collend = p+1;
5874 while (collend < end) {
5875 if ((0 < *collend && *collend < 256) ||
5876 !Py_UNICODE_ISSPACE(*collend) ||
5877 Py_UNICODE_TODECIMAL(*collend))
5878 break;
5879 }
5880 /* cache callback name lookup
5881 * (if not done yet, i.e. it's the first error) */
5882 if (known_errorHandler==-1) {
5883 if ((errors==NULL) || (!strcmp(errors, "strict")))
5884 known_errorHandler = 1;
5885 else if (!strcmp(errors, "replace"))
5886 known_errorHandler = 2;
5887 else if (!strcmp(errors, "ignore"))
5888 known_errorHandler = 3;
5889 else if (!strcmp(errors, "xmlcharrefreplace"))
5890 known_errorHandler = 4;
5891 else
5892 known_errorHandler = 0;
5893 }
5894 switch (known_errorHandler) {
5895 case 1: /* strict */
5896 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5897 goto onError;
5898 case 2: /* replace */
5899 for (p = collstart; p < collend; ++p)
5900 *output++ = '?';
5901 /* fall through */
5902 case 3: /* ignore */
5903 p = collend;
5904 break;
5905 case 4: /* xmlcharrefreplace */
5906 /* generate replacement (temporarily (mis)uses p) */
5907 for (p = collstart; p < collend; ++p)
5908 output += sprintf(output, "&#%d;", (int)*p);
5909 p = collend;
5910 break;
5911 default:
5912 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5913 encoding, reason, s, length, &exc,
5914 collstart-s, collend-s, &newpos);
5915 if (repunicode == NULL)
5916 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005917 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005918 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005919 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5920 Py_DECREF(repunicode);
5921 goto onError;
5922 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 /* generate replacement */
5924 repsize = PyUnicode_GET_SIZE(repunicode);
5925 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5926 Py_UNICODE ch = *uni2;
5927 if (Py_UNICODE_ISSPACE(ch))
5928 *output++ = ' ';
5929 else {
5930 decimal = Py_UNICODE_TODECIMAL(ch);
5931 if (decimal >= 0)
5932 *output++ = '0' + decimal;
5933 else if (0 < ch && ch < 256)
5934 *output++ = (char)ch;
5935 else {
5936 Py_DECREF(repunicode);
5937 raise_encode_exception(&exc, encoding,
5938 s, length, collstart-s, collend-s, reason);
5939 goto onError;
5940 }
5941 }
5942 }
5943 p = s + newpos;
5944 Py_DECREF(repunicode);
5945 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005946 }
5947 /* 0-terminate the output string */
5948 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005949 Py_XDECREF(exc);
5950 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005951 return 0;
5952
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005954 Py_XDECREF(exc);
5955 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005956 return -1;
5957}
5958
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959/* --- Helpers ------------------------------------------------------------ */
5960
Eric Smith8c663262007-08-25 02:26:07 +00005961#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005962#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005963
Thomas Wouters477c8d52006-05-27 19:21:47 +00005964#include "stringlib/count.h"
5965#include "stringlib/find.h"
5966#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005967#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005968
Eric Smith5807c412008-05-11 21:00:57 +00005969#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005970#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005971#include "stringlib/localeutil.h"
5972
/* Helper macro to fix up start/end slice values against a sequence of
   length len:
     - end greater than len is clamped to len;
     - a negative end or start has len added to it and, if still negative,
       is set to 0.
   start and end must be modifiable lvalues; they are updated in place.
   All three arguments are evaluated more than once, so they must not have
   side effects.  The body is wrapped in do { } while (0) so the macro acts
   as a single statement (safe inside an unbraced if/else); invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005987
Martin v. Löwis18e16552006-02-15 17:27:45 +00005988Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005989 PyObject *substr,
5990 Py_ssize_t start,
5991 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005993 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005994 PyUnicodeObject* str_obj;
5995 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005996
Thomas Wouters477c8d52006-05-27 19:21:47 +00005997 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5998 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006000 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6001 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 Py_DECREF(str_obj);
6003 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 }
Tim Petersced69f82003-09-16 20:30:58 +00006005
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006006 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006007 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006008 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6009 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006010 );
6011
6012 Py_DECREF(sub_obj);
6013 Py_DECREF(str_obj);
6014
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 return result;
6016}
6017
Martin v. Löwis18e16552006-02-15 17:27:45 +00006018Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006019 PyObject *sub,
6020 Py_ssize_t start,
6021 Py_ssize_t end,
6022 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006024 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006025
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006027 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006029 sub = PyUnicode_FromObject(sub);
6030 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 Py_DECREF(str);
6032 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 }
Tim Petersced69f82003-09-16 20:30:58 +00006034
Thomas Wouters477c8d52006-05-27 19:21:47 +00006035 if (direction > 0)
6036 result = stringlib_find_slice(
6037 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6038 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6039 start, end
6040 );
6041 else
6042 result = stringlib_rfind_slice(
6043 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6044 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6045 start, end
6046 );
6047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006049 Py_DECREF(sub);
6050
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 return result;
6052}
6053
Tim Petersced69f82003-09-16 20:30:58 +00006054static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 PyUnicodeObject *substring,
6057 Py_ssize_t start,
6058 Py_ssize_t end,
6059 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 if (substring->length == 0)
6062 return 1;
6063
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006064 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 end -= substring->length;
6066 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068
6069 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 if (Py_UNICODE_MATCH(self, end, substring))
6071 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 } else {
6073 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 }
6076
6077 return 0;
6078}
6079
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 PyObject *substr,
6082 Py_ssize_t start,
6083 Py_ssize_t end,
6084 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006086 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006087
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 str = PyUnicode_FromObject(str);
6089 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 substr = PyUnicode_FromObject(substr);
6092 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 Py_DECREF(str);
6094 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 }
Tim Petersced69f82003-09-16 20:30:58 +00006096
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 (PyUnicodeObject *)substr,
6099 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 Py_DECREF(str);
6101 Py_DECREF(substr);
6102 return result;
6103}
6104
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105/* Apply fixfct filter to the Unicode object self and return a
6106 reference to the modified object */
6107
Tim Petersced69f82003-09-16 20:30:58 +00006108static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111{
6112
6113 PyUnicodeObject *u;
6114
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006115 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006118
6119 Py_UNICODE_COPY(u->str, self->str, self->length);
6120
Tim Peters7a29bd52001-09-12 03:03:31 +00006121 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 /* fixfct should return TRUE if it modified the buffer. If
6123 FALSE, return a reference to the original buffer instead
6124 (to save space, not time) */
6125 Py_INCREF(self);
6126 Py_DECREF(u);
6127 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 }
6129 return (PyObject*) u;
6130}
6131
Tim Petersced69f82003-09-16 20:30:58 +00006132static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133int fixupper(PyUnicodeObject *self)
6134{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006135 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 Py_UNICODE *s = self->str;
6137 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006138
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006141
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 ch = Py_UNICODE_TOUPPER(*s);
6143 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 *s = ch;
6146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 s++;
6148 }
6149
6150 return status;
6151}
6152
Tim Petersced69f82003-09-16 20:30:58 +00006153static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154int fixlower(PyUnicodeObject *self)
6155{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006156 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 Py_UNICODE *s = self->str;
6158 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006159
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006162
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 ch = Py_UNICODE_TOLOWER(*s);
6164 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 *s = ch;
6167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 s++;
6169 }
6170
6171 return status;
6172}
6173
Tim Petersced69f82003-09-16 20:30:58 +00006174static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175int fixswapcase(PyUnicodeObject *self)
6176{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006177 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 Py_UNICODE *s = self->str;
6179 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006180
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 while (len-- > 0) {
6182 if (Py_UNICODE_ISUPPER(*s)) {
6183 *s = Py_UNICODE_TOLOWER(*s);
6184 status = 1;
6185 } else if (Py_UNICODE_ISLOWER(*s)) {
6186 *s = Py_UNICODE_TOUPPER(*s);
6187 status = 1;
6188 }
6189 s++;
6190 }
6191
6192 return status;
6193}
6194
Tim Petersced69f82003-09-16 20:30:58 +00006195static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196int fixcapitalize(PyUnicodeObject *self)
6197{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006198 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006199 Py_UNICODE *s = self->str;
6200 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006201
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006202 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006204 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 *s = Py_UNICODE_TOUPPER(*s);
6206 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006208 s++;
6209 while (--len > 0) {
6210 if (Py_UNICODE_ISUPPER(*s)) {
6211 *s = Py_UNICODE_TOLOWER(*s);
6212 status = 1;
6213 }
6214 s++;
6215 }
6216 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217}
6218
6219static
6220int fixtitle(PyUnicodeObject *self)
6221{
6222 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6223 register Py_UNICODE *e;
6224 int previous_is_cased;
6225
6226 /* Shortcut for single character strings */
6227 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6229 if (*p != ch) {
6230 *p = ch;
6231 return 1;
6232 }
6233 else
6234 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 }
Tim Petersced69f82003-09-16 20:30:58 +00006236
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 e = p + PyUnicode_GET_SIZE(self);
6238 previous_is_cased = 0;
6239 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006241
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 if (previous_is_cased)
6243 *p = Py_UNICODE_TOLOWER(ch);
6244 else
6245 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006246
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 if (Py_UNICODE_ISLOWER(ch) ||
6248 Py_UNICODE_ISUPPER(ch) ||
6249 Py_UNICODE_ISTITLE(ch))
6250 previous_is_cased = 1;
6251 else
6252 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 }
6254 return 1;
6255}
6256
Tim Peters8ce9f162004-08-27 01:49:32 +00006257PyObject *
6258PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259{
Skip Montanaro6543b452004-09-16 03:28:13 +00006260 const Py_UNICODE blank = ' ';
6261 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006262 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006263 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006264 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6265 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006266 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6267 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006268 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006269 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270
Tim Peters05eba1f2004-08-27 21:32:02 +00006271 fseq = PySequence_Fast(seq, "");
6272 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006273 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006274 }
6275
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006276 /* NOTE: the following code can't call back into Python code,
6277 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006278 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006279
Tim Peters05eba1f2004-08-27 21:32:02 +00006280 seqlen = PySequence_Fast_GET_SIZE(fseq);
6281 /* If empty sequence, return u"". */
6282 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006283 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6284 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006285 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006286 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006287 /* If singleton sequence with an exact Unicode, return that. */
6288 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 item = items[0];
6290 if (PyUnicode_CheckExact(item)) {
6291 Py_INCREF(item);
6292 res = (PyUnicodeObject *)item;
6293 goto Done;
6294 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006295 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006296 else {
6297 /* Set up sep and seplen */
6298 if (separator == NULL) {
6299 sep = &blank;
6300 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006301 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006302 else {
6303 if (!PyUnicode_Check(separator)) {
6304 PyErr_Format(PyExc_TypeError,
6305 "separator: expected str instance,"
6306 " %.80s found",
6307 Py_TYPE(separator)->tp_name);
6308 goto onError;
6309 }
6310 sep = PyUnicode_AS_UNICODE(separator);
6311 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006312 }
6313 }
6314
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006315 /* There are at least two things to join, or else we have a subclass
6316 * of str in the sequence.
6317 * Do a pre-pass to figure out the total amount of space we'll
6318 * need (sz), and see whether all argument are strings.
6319 */
6320 sz = 0;
6321 for (i = 0; i < seqlen; i++) {
6322 const Py_ssize_t old_sz = sz;
6323 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 if (!PyUnicode_Check(item)) {
6325 PyErr_Format(PyExc_TypeError,
6326 "sequence item %zd: expected str instance,"
6327 " %.80s found",
6328 i, Py_TYPE(item)->tp_name);
6329 goto onError;
6330 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006331 sz += PyUnicode_GET_SIZE(item);
6332 if (i != 0)
6333 sz += seplen;
6334 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6335 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006336 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006337 goto onError;
6338 }
6339 }
Tim Petersced69f82003-09-16 20:30:58 +00006340
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006341 res = _PyUnicode_New(sz);
6342 if (res == NULL)
6343 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006344
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006345 /* Catenate everything. */
6346 res_p = PyUnicode_AS_UNICODE(res);
6347 for (i = 0; i < seqlen; ++i) {
6348 Py_ssize_t itemlen;
6349 item = items[i];
6350 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 /* Copy item, and maybe the separator. */
6352 if (i) {
6353 Py_UNICODE_COPY(res_p, sep, seplen);
6354 res_p += seplen;
6355 }
6356 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6357 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006358 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006359
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006361 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 return (PyObject *)res;
6363
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006365 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006366 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 return NULL;
6368}
6369
Tim Petersced69f82003-09-16 20:30:58 +00006370static
6371PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 Py_ssize_t left,
6373 Py_ssize_t right,
6374 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
6376 PyUnicodeObject *u;
6377
6378 if (left < 0)
6379 left = 0;
6380 if (right < 0)
6381 right = 0;
6382
Tim Peters7a29bd52001-09-12 03:03:31 +00006383 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 Py_INCREF(self);
6385 return self;
6386 }
6387
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006388 if (left > PY_SSIZE_T_MAX - self->length ||
6389 right > PY_SSIZE_T_MAX - (left + self->length)) {
6390 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6391 return NULL;
6392 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 u = _PyUnicode_New(left + self->length + right);
6394 if (u) {
6395 if (left)
6396 Py_UNICODE_FILL(u->str, fill, left);
6397 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6398 if (right)
6399 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6400 }
6401
6402 return u;
6403}
6404
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006405PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
6409 string = PyUnicode_FromObject(string);
6410 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006413 list = stringlib_splitlines(
6414 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6415 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416
6417 Py_DECREF(string);
6418 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419}
6420
Tim Petersced69f82003-09-16 20:30:58 +00006421static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 PyUnicodeObject *substring,
6424 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006427 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006430 return stringlib_split_whitespace(
6431 (PyObject*) self, self->str, self->length, maxcount
6432 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006434 return stringlib_split(
6435 (PyObject*) self, self->str, self->length,
6436 substring->str, substring->length,
6437 maxcount
6438 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439}
6440
Tim Petersced69f82003-09-16 20:30:58 +00006441static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006442PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 PyUnicodeObject *substring,
6444 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006445{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006446 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006447 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006448
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006449 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006450 return stringlib_rsplit_whitespace(
6451 (PyObject*) self, self->str, self->length, maxcount
6452 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006453
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006454 return stringlib_rsplit(
6455 (PyObject*) self, self->str, self->length,
6456 substring->str, substring->length,
6457 maxcount
6458 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006459}
6460
6461static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 PyUnicodeObject *str1,
6464 PyUnicodeObject *str2,
6465 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466{
6467 PyUnicodeObject *u;
6468
6469 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006471 else if (maxcount == 0 || self->length == 0)
6472 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473
Thomas Wouters477c8d52006-05-27 19:21:47 +00006474 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006475 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006476 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006477 if (str1->length == 0)
6478 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006479 if (str1->length == 1) {
6480 /* replace characters */
6481 Py_UNICODE u1, u2;
6482 if (!findchar(self->str, self->length, str1->str[0]))
6483 goto nothing;
6484 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6485 if (!u)
6486 return NULL;
6487 Py_UNICODE_COPY(u->str, self->str, self->length);
6488 u1 = str1->str[0];
6489 u2 = str2->str[0];
6490 for (i = 0; i < u->length; i++)
6491 if (u->str[i] == u1) {
6492 if (--maxcount < 0)
6493 break;
6494 u->str[i] = u2;
6495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006497 i = stringlib_find(
6498 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006500 if (i < 0)
6501 goto nothing;
6502 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6503 if (!u)
6504 return NULL;
6505 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006506
6507 /* change everything in-place, starting with this one */
6508 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6509 i += str1->length;
6510
6511 while ( --maxcount > 0) {
6512 i = stringlib_find(self->str+i, self->length-i,
6513 str1->str, str1->length,
6514 i);
6515 if (i == -1)
6516 break;
6517 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6518 i += str1->length;
6519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006522
6523 Py_ssize_t n, i, j, e;
6524 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 Py_UNICODE *p;
6526
6527 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006528 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6529 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006530 if (n == 0)
6531 goto nothing;
6532 /* new_size = self->length + n * (str2->length - str1->length)); */
6533 delta = (str2->length - str1->length);
6534 if (delta == 0) {
6535 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006537 product = n * (str2->length - str1->length);
6538 if ((product / (str2->length - str1->length)) != n) {
6539 PyErr_SetString(PyExc_OverflowError,
6540 "replace string is too long");
6541 return NULL;
6542 }
6543 new_size = self->length + product;
6544 if (new_size < 0) {
6545 PyErr_SetString(PyExc_OverflowError,
6546 "replace string is too long");
6547 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 }
6549 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006550 u = _PyUnicode_New(new_size);
6551 if (!u)
6552 return NULL;
6553 i = 0;
6554 p = u->str;
6555 e = self->length - str1->length;
6556 if (str1->length > 0) {
6557 while (n-- > 0) {
6558 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006559 j = stringlib_find(self->str+i, self->length-i,
6560 str1->str, str1->length,
6561 i);
6562 if (j == -1)
6563 break;
6564 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006565 /* copy unchanged part [i:j] */
6566 Py_UNICODE_COPY(p, self->str+i, j-i);
6567 p += j - i;
6568 }
6569 /* copy substitution string */
6570 if (str2->length > 0) {
6571 Py_UNICODE_COPY(p, str2->str, str2->length);
6572 p += str2->length;
6573 }
6574 i = j + str1->length;
6575 }
6576 if (i < self->length)
6577 /* copy tail [i:] */
6578 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6579 } else {
6580 /* interleave */
6581 while (n > 0) {
6582 Py_UNICODE_COPY(p, str2->str, str2->length);
6583 p += str2->length;
6584 if (--n <= 0)
6585 break;
6586 *p++ = self->str[i++];
6587 }
6588 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006592
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006594 /* nothing to replace; return original string (when possible) */
6595 if (PyUnicode_CheckExact(self)) {
6596 Py_INCREF(self);
6597 return (PyObject *) self;
6598 }
6599 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600}
6601
6602/* --- Unicode Object Methods --------------------------------------------- */
6603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006604PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606\n\
6607Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006608characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
6610static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006611unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 return fixup(self, fixtitle);
6614}
6615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006616PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618\n\
6619Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006620have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621
6622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006623unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 return fixup(self, fixcapitalize);
6626}
6627
#if 0
/* Disabled: docstring and implementation of S.capwords(). */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self into a list of words, replace every list item by its
   fixup(..., fixcapitalize) result, then join the list again.
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the list's reference to the old word before overwriting */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6665
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006666/* Argument converter. Coerces to a single unicode character */
6667
6668static int
6669convert_uc(PyObject *obj, void *addr)
6670{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006671 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6672 PyObject *uniobj;
6673 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006674
Benjamin Peterson14339b62009-01-31 16:36:08 +00006675 uniobj = PyUnicode_FromObject(obj);
6676 if (uniobj == NULL) {
6677 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006679 return 0;
6680 }
6681 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6682 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006684 Py_DECREF(uniobj);
6685 return 0;
6686 }
6687 unistr = PyUnicode_AS_UNICODE(uniobj);
6688 *fillcharloc = unistr[0];
6689 Py_DECREF(uniobj);
6690 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006691}
6692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006693PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006694 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006696Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006697done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698
6699static PyObject *
6700unicode_center(PyUnicodeObject *self, PyObject *args)
6701{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006702 Py_ssize_t marg, left;
6703 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006704 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705
Thomas Woutersde017742006-02-16 19:34:37 +00006706 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 return NULL;
6708
Tim Peters7a29bd52001-09-12 03:03:31 +00006709 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 Py_INCREF(self);
6711 return (PyObject*) self;
6712 }
6713
6714 marg = width - self->length;
6715 left = marg / 2 + (marg & width & 1);
6716
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006717 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718}
6719
Marc-André Lemburge5034372000-08-08 08:04:29 +00006720#if 0
6721
6722/* This code should go into some future Unicode collation support
6723 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006724 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006725
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006726/* speedy UTF-16 code point order comparison */
6727/* gleaned from: */
6728/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6729
/* Offset table indexed by the top five bits of a 16-bit code unit
   (c >> 11), i.e. one entry per 0x800-wide block.

   Entry 27 (0xD800-0xDFFF, the surrogate block) moves code units up by
   0x2000; entries 28-31 (0xE000-0xFFFF) move them down by 0x800.  All
   other blocks are left untouched. */
static short utf16Fixup[32] =
{
    /* blocks  0.. 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* blocks  8..15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* blocks 16..23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* blocks 24..31 */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
6737
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738static int
6739unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6740{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006741 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 Py_UNICODE *s1 = str1->str;
6744 Py_UNICODE *s2 = str2->str;
6745
6746 len1 = str1->length;
6747 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006748
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006750 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006751
6752 c1 = *s1++;
6753 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006754
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 if (c1 > (1<<11) * 26)
6756 c1 += utf16Fixup[c1>>11];
6757 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006758 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006759 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006760
6761 if (c1 != c2)
6762 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006763
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006764 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 }
6766
6767 return (len1 < len2) ? -1 : (len1 != len2);
6768}
6769
Marc-André Lemburge5034372000-08-08 08:04:29 +00006770#else
6771
6772static int
6773unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6774{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006775 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006776
6777 Py_UNICODE *s1 = str1->str;
6778 Py_UNICODE *s2 = str2->str;
6779
6780 len1 = str1->length;
6781 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006782
Marc-André Lemburge5034372000-08-08 08:04:29 +00006783 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006784 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006785
Fredrik Lundh45714e92001-06-26 16:39:36 +00006786 c1 = *s1++;
6787 c2 = *s2++;
6788
6789 if (c1 != c2)
6790 return (c1 < c2) ? -1 : 1;
6791
Marc-André Lemburge5034372000-08-08 08:04:29 +00006792 len1--; len2--;
6793 }
6794
6795 return (len1 < len2) ? -1 : (len1 != len2);
6796}
6797
6798#endif
6799
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006803 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6804 return unicode_compare((PyUnicodeObject *)left,
6805 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006806 PyErr_Format(PyExc_TypeError,
6807 "Can't compare %.100s and %.100s",
6808 left->ob_type->tp_name,
6809 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 return -1;
6811}
6812
Martin v. Löwis5b222132007-06-10 09:51:05 +00006813int
6814PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6815{
6816 int i;
6817 Py_UNICODE *id;
6818 assert(PyUnicode_Check(uni));
6819 id = PyUnicode_AS_UNICODE(uni);
6820 /* Compare Unicode string and source character set string */
6821 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 if (id[i] != str[i])
6823 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006824 /* This check keeps Python strings that end in '\0' from comparing equal
6825 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006826 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006828 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006830 return 0;
6831}
6832
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006833
Benjamin Peterson29060642009-01-31 22:14:21 +00006834#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006835 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006836
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006837PyObject *PyUnicode_RichCompare(PyObject *left,
6838 PyObject *right,
6839 int op)
6840{
6841 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006842
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006843 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6844 PyObject *v;
6845 if (((PyUnicodeObject *) left)->length !=
6846 ((PyUnicodeObject *) right)->length) {
6847 if (op == Py_EQ) {
6848 Py_INCREF(Py_False);
6849 return Py_False;
6850 }
6851 if (op == Py_NE) {
6852 Py_INCREF(Py_True);
6853 return Py_True;
6854 }
6855 }
6856 if (left == right)
6857 result = 0;
6858 else
6859 result = unicode_compare((PyUnicodeObject *)left,
6860 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006861
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006862 /* Convert the return value to a Boolean */
6863 switch (op) {
6864 case Py_EQ:
6865 v = TEST_COND(result == 0);
6866 break;
6867 case Py_NE:
6868 v = TEST_COND(result != 0);
6869 break;
6870 case Py_LE:
6871 v = TEST_COND(result <= 0);
6872 break;
6873 case Py_GE:
6874 v = TEST_COND(result >= 0);
6875 break;
6876 case Py_LT:
6877 v = TEST_COND(result == -1);
6878 break;
6879 case Py_GT:
6880 v = TEST_COND(result == 1);
6881 break;
6882 default:
6883 PyErr_BadArgument();
6884 return NULL;
6885 }
6886 Py_INCREF(v);
6887 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006889
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006890 Py_INCREF(Py_NotImplemented);
6891 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006892}
6893
Guido van Rossum403d68b2000-03-13 15:55:09 +00006894int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006896{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006897 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006898 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006899
6900 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006901 sub = PyUnicode_FromObject(element);
6902 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 PyErr_Format(PyExc_TypeError,
6904 "'in <string>' requires string as left operand, not %s",
6905 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006906 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006907 }
6908
Thomas Wouters477c8d52006-05-27 19:21:47 +00006909 str = PyUnicode_FromObject(container);
6910 if (!str) {
6911 Py_DECREF(sub);
6912 return -1;
6913 }
6914
6915 result = stringlib_contains_obj(str, sub);
6916
6917 Py_DECREF(str);
6918 Py_DECREF(sub);
6919
Guido van Rossum403d68b2000-03-13 15:55:09 +00006920 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006921}
6922
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923/* Concat to string or Unicode object giving a new Unicode object. */
6924
6925PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927{
6928 PyUnicodeObject *u = NULL, *v = NULL, *w;
6929
6930 /* Coerce the two arguments */
6931 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6932 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6935 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937
6938 /* Shortcuts */
6939 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 Py_DECREF(v);
6941 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 }
6943 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 Py_DECREF(u);
6945 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 }
6947
6948 /* Concat the two Unicode strings */
6949 w = _PyUnicode_New(u->length + v->length);
6950 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 Py_UNICODE_COPY(w->str, u->str, u->length);
6953 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6954
6955 Py_DECREF(u);
6956 Py_DECREF(v);
6957 return (PyObject *)w;
6958
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 Py_XDECREF(u);
6961 Py_XDECREF(v);
6962 return NULL;
6963}
6964
Walter Dörwald1ab83302007-05-18 17:15:44 +00006965void
6966PyUnicode_Append(PyObject **pleft, PyObject *right)
6967{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006968 PyObject *new;
6969 if (*pleft == NULL)
6970 return;
6971 if (right == NULL || !PyUnicode_Check(*pleft)) {
6972 Py_DECREF(*pleft);
6973 *pleft = NULL;
6974 return;
6975 }
6976 new = PyUnicode_Concat(*pleft, right);
6977 Py_DECREF(*pleft);
6978 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006979}
6980
6981void
6982PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6983{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006984 PyUnicode_Append(pleft, right);
6985 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006986}
6987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006988PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006991Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006992string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006993interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
6995static PyObject *
6996unicode_count(PyUnicodeObject *self, PyObject *args)
6997{
6998 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006999 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007000 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 PyObject *result;
7002
Guido van Rossumb8872e62000-05-09 14:14:27 +00007003 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 return NULL;
7006
7007 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007008 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007011
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007012 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007013 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007014 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007015 substring->str, substring->length,
7016 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007017 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018
7019 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007020
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 return result;
7022}
7023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007024PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007027Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007028to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007029handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007030a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7031'xmlcharrefreplace' as well as any other name registered with\n\
7032codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033
7034static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007035unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007037 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 char *encoding = NULL;
7039 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007040 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007041
Benjamin Peterson308d6372009-09-18 21:42:35 +00007042 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7043 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007045 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007046 if (v == NULL)
7047 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007048 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007049 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007050 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007051 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007052 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007053 Py_DECREF(v);
7054 return NULL;
7055 }
7056 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007057
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007059 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007060}
7061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007062PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064\n\
7065Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007066If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067
7068static PyObject*
7069unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7070{
7071 Py_UNICODE *e;
7072 Py_UNICODE *p;
7073 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007074 Py_UNICODE *qe;
7075 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076 PyUnicodeObject *u;
7077 int tabsize = 8;
7078
7079 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081
Thomas Wouters7e474022000-07-16 12:04:32 +00007082 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007083 i = 0; /* chars up to and including most recent \n or \r */
7084 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7085 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086 for (p = self->str; p < e; p++)
7087 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 if (tabsize > 0) {
7089 incr = tabsize - (j % tabsize); /* cannot overflow */
7090 if (j > PY_SSIZE_T_MAX - incr)
7091 goto overflow1;
7092 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 if (j > PY_SSIZE_T_MAX - 1)
7097 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 j++;
7099 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 if (i > PY_SSIZE_T_MAX - j)
7101 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007103 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 }
7105 }
7106
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007107 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007108 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007109
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 /* Second pass: create output string and fill it */
7111 u = _PyUnicode_New(i + j);
7112 if (!u)
7113 return NULL;
7114
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007115 j = 0; /* same as in first pass */
7116 q = u->str; /* next output char */
7117 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118
7119 for (p = self->str; p < e; p++)
7120 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 if (tabsize > 0) {
7122 i = tabsize - (j % tabsize);
7123 j += i;
7124 while (i--) {
7125 if (q >= qe)
7126 goto overflow2;
7127 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007128 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007130 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 else {
7132 if (q >= qe)
7133 goto overflow2;
7134 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007135 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 if (*p == '\n' || *p == '\r')
7137 j = 0;
7138 }
7139
7140 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007141
7142 overflow2:
7143 Py_DECREF(u);
7144 overflow1:
7145 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7146 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147}
7148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007149PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151\n\
7152Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007153such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154arguments start and end are interpreted as in slice notation.\n\
7155\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007156Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
7158static PyObject *
7159unicode_find(PyUnicodeObject *self, PyObject *args)
7160{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007161 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007162 Py_ssize_t start;
7163 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007164 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165
Christian Heimes9cd17752007-11-18 19:35:23 +00007166 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168
Thomas Wouters477c8d52006-05-27 19:21:47 +00007169 result = stringlib_find_slice(
7170 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7171 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7172 start, end
7173 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174
7175 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007176
Christian Heimes217cfd12007-12-02 14:31:20 +00007177 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178}
7179
7180static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007181unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182{
7183 if (index < 0 || index >= self->length) {
7184 PyErr_SetString(PyExc_IndexError, "string index out of range");
7185 return NULL;
7186 }
7187
7188 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7189}
7190
Guido van Rossumc2504932007-09-18 19:42:40 +00007191/* Believe it or not, this produces the same value for ASCII strings
7192 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007194unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195{
Guido van Rossumc2504932007-09-18 19:42:40 +00007196 Py_ssize_t len;
7197 Py_UNICODE *p;
7198 long x;
7199
7200 if (self->hash != -1)
7201 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007202 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007203 p = self->str;
7204 x = *p << 7;
7205 while (--len >= 0)
7206 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007207 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007208 if (x == -1)
7209 x = -2;
7210 self->hash = x;
7211 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212}
7213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007214PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007217Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218
7219static PyObject *
7220unicode_index(PyUnicodeObject *self, PyObject *args)
7221{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007222 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007223 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007224 Py_ssize_t start;
7225 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226
Christian Heimes9cd17752007-11-18 19:35:23 +00007227 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229
Thomas Wouters477c8d52006-05-27 19:21:47 +00007230 result = stringlib_find_slice(
7231 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7232 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7233 start, end
7234 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235
7236 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007237
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 if (result < 0) {
7239 PyErr_SetString(PyExc_ValueError, "substring not found");
7240 return NULL;
7241 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007242
Christian Heimes217cfd12007-12-02 14:31:20 +00007243 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244}
7245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007246PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007249Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007250at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251
7252static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007253unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254{
7255 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7256 register const Py_UNICODE *e;
7257 int cased;
7258
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 /* Shortcut for single character strings */
7260 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007263 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007264 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007265 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007266
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267 e = p + PyUnicode_GET_SIZE(self);
7268 cased = 0;
7269 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007271
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7273 return PyBool_FromLong(0);
7274 else if (!cased && Py_UNICODE_ISLOWER(ch))
7275 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007277 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278}
7279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007280PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007281 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007283Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007284at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285
7286static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007287unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288{
7289 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7290 register const Py_UNICODE *e;
7291 int cased;
7292
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 /* Shortcut for single character strings */
7294 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007297 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007298 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007300
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 e = p + PyUnicode_GET_SIZE(self);
7302 cased = 0;
7303 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007305
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7307 return PyBool_FromLong(0);
7308 else if (!cased && Py_UNICODE_ISUPPER(ch))
7309 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007311 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312}
7313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007314PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007317Return True if S is a titlecased string and there is at least one\n\
7318character in S, i.e. upper- and titlecase characters may only\n\
7319follow uncased characters and lowercase characters only cased ones.\n\
7320Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321
7322static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007323unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324{
7325 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7326 register const Py_UNICODE *e;
7327 int cased, previous_is_cased;
7328
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329 /* Shortcut for single character strings */
7330 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7332 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007334 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007335 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007337
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338 e = p + PyUnicode_GET_SIZE(self);
7339 cased = 0;
7340 previous_is_cased = 0;
7341 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007343
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7345 if (previous_is_cased)
7346 return PyBool_FromLong(0);
7347 previous_is_cased = 1;
7348 cased = 1;
7349 }
7350 else if (Py_UNICODE_ISLOWER(ch)) {
7351 if (!previous_is_cased)
7352 return PyBool_FromLong(0);
7353 previous_is_cased = 1;
7354 cased = 1;
7355 }
7356 else
7357 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007359 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360}
7361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007362PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007365Return True if all characters in S are whitespace\n\
7366and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007369unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370{
7371 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7372 register const Py_UNICODE *e;
7373
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374 /* Shortcut for single character strings */
7375 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 Py_UNICODE_ISSPACE(*p))
7377 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007379 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007380 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007382
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383 e = p + PyUnicode_GET_SIZE(self);
7384 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 if (!Py_UNICODE_ISSPACE(*p))
7386 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007388 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389}
7390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007391PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007393\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007394Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007395and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007396
7397static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007398unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007399{
7400 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7401 register const Py_UNICODE *e;
7402
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007403 /* Shortcut for single character strings */
7404 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 Py_UNICODE_ISALPHA(*p))
7406 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007407
7408 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007409 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007411
7412 e = p + PyUnicode_GET_SIZE(self);
7413 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 if (!Py_UNICODE_ISALPHA(*p))
7415 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007416 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007417 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007418}
7419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007420PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007422\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007423Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007424and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007425
7426static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007427unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007428{
7429 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7430 register const Py_UNICODE *e;
7431
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007432 /* Shortcut for single character strings */
7433 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 Py_UNICODE_ISALNUM(*p))
7435 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007436
7437 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007438 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007440
7441 e = p + PyUnicode_GET_SIZE(self);
7442 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 if (!Py_UNICODE_ISALNUM(*p))
7444 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007445 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007446 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007447}
7448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007449PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007452Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007453False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454
7455static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007456unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457{
7458 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7459 register const Py_UNICODE *e;
7460
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 /* Shortcut for single character strings */
7462 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 Py_UNICODE_ISDECIMAL(*p))
7464 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007466 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007467 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007469
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470 e = p + PyUnicode_GET_SIZE(self);
7471 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 if (!Py_UNICODE_ISDECIMAL(*p))
7473 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007475 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476}
7477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007478PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007481Return True if all characters in S are digits\n\
7482and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483
7484static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007485unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486{
7487 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7488 register const Py_UNICODE *e;
7489
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 /* Shortcut for single character strings */
7491 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 Py_UNICODE_ISDIGIT(*p))
7493 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007495 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007496 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007498
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 e = p + PyUnicode_GET_SIZE(self);
7500 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 if (!Py_UNICODE_ISDIGIT(*p))
7502 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007504 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505}
7506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007507PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007510Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007511False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512
7513static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007514unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515{
7516 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7517 register const Py_UNICODE *e;
7518
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519 /* Shortcut for single character strings */
7520 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 Py_UNICODE_ISNUMERIC(*p))
7522 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007524 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007525 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007527
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 e = p + PyUnicode_GET_SIZE(self);
7529 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 if (!Py_UNICODE_ISNUMERIC(*p))
7531 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007533 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534}
7535
Martin v. Löwis47383402007-08-15 07:32:56 +00007536int
7537PyUnicode_IsIdentifier(PyObject *self)
7538{
7539 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7540 register const Py_UNICODE *e;
7541
7542 /* Special case for empty strings */
7543 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007545
7546 /* PEP 3131 says that the first character must be in
7547 XID_Start and subsequent characters in XID_Continue,
7548 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007549 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007550 letters, digits, underscore). However, given the current
7551 definition of XID_Start and XID_Continue, it is sufficient
7552 to check just for these, except that _ must be allowed
7553 as starting an identifier. */
7554 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7555 return 0;
7556
7557 e = p + PyUnicode_GET_SIZE(self);
7558 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 if (!_PyUnicode_IsXidContinue(*p))
7560 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007561 }
7562 return 1;
7563}
7564
7565PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007567\n\
7568Return True if S is a valid identifier according\n\
7569to the language definition.");
7570
7571static PyObject*
7572unicode_isidentifier(PyObject *self)
7573{
7574 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7575}
7576
Georg Brandl559e5d72008-06-11 18:37:52 +00007577PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007579\n\
7580Return True if all characters in S are considered\n\
7581printable in repr() or S is empty, False otherwise.");
7582
7583static PyObject*
7584unicode_isprintable(PyObject *self)
7585{
7586 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7587 register const Py_UNICODE *e;
7588
7589 /* Shortcut for single character strings */
7590 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7591 Py_RETURN_TRUE;
7592 }
7593
7594 e = p + PyUnicode_GET_SIZE(self);
7595 for (; p < e; p++) {
7596 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7597 Py_RETURN_FALSE;
7598 }
7599 }
7600 Py_RETURN_TRUE;
7601}
7602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007603PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007604 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605\n\
7606Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007607iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608
7609static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007610unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007612 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613}
7614
Martin v. Löwis18e16552006-02-15 17:27:45 +00007615static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616unicode_length(PyUnicodeObject *self)
7617{
7618 return self->length;
7619}
7620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007621PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007624Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007625done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
7627static PyObject *
7628unicode_ljust(PyUnicodeObject *self, PyObject *args)
7629{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007630 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007631 Py_UNICODE fillchar = ' ';
7632
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007633 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634 return NULL;
7635
Tim Peters7a29bd52001-09-12 03:03:31 +00007636 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 Py_INCREF(self);
7638 return (PyObject*) self;
7639 }
7640
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007641 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642}
7643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007644PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007647Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648
7649static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007650unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 return fixup(self, fixlower);
7653}
7654
/* Strip modes understood by the strip helpers below */
#define LEFTSTRIP  0    /* strip the left end only */
#define RIGHTSTRIP 1    /* strip the right end only */
#define BOTHSTRIP  2    /* strip both ends */

/* PyArg_ParseTuple format strings, indexed by the strip mode above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7663
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007664/* externally visible for str.strip(unicode) */
7665PyObject *
7666_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7667{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007668 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7669 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7670 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7671 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7672 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007673
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007675
Benjamin Peterson14339b62009-01-31 16:36:08 +00007676 i = 0;
7677 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7679 i++;
7680 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007681 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007682
Benjamin Peterson14339b62009-01-31 16:36:08 +00007683 j = len;
7684 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 do {
7686 j--;
7687 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7688 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007689 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007690
Benjamin Peterson14339b62009-01-31 16:36:08 +00007691 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 Py_INCREF(self);
7693 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007694 }
7695 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007697}
7698
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699
7700static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007701do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007703 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7704 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007705
Benjamin Peterson14339b62009-01-31 16:36:08 +00007706 i = 0;
7707 if (striptype != RIGHTSTRIP) {
7708 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7709 i++;
7710 }
7711 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007712
Benjamin Peterson14339b62009-01-31 16:36:08 +00007713 j = len;
7714 if (striptype != LEFTSTRIP) {
7715 do {
7716 j--;
7717 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7718 j++;
7719 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007720
Benjamin Peterson14339b62009-01-31 16:36:08 +00007721 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7722 Py_INCREF(self);
7723 return (PyObject*)self;
7724 }
7725 else
7726 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727}
7728
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007729
7730static PyObject *
7731do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7732{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007733 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007734
Benjamin Peterson14339b62009-01-31 16:36:08 +00007735 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7736 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007737
Benjamin Peterson14339b62009-01-31 16:36:08 +00007738 if (sep != NULL && sep != Py_None) {
7739 if (PyUnicode_Check(sep))
7740 return _PyUnicode_XStrip(self, striptype, sep);
7741 else {
7742 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007743 "%s arg must be None or str",
7744 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007745 return NULL;
7746 }
7747 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007748
Benjamin Peterson14339b62009-01-31 16:36:08 +00007749 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007750}
7751
7752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007753PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007755\n\
7756Return a copy of the string S with leading and trailing\n\
7757whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007758If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007759
7760static PyObject *
7761unicode_strip(PyUnicodeObject *self, PyObject *args)
7762{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007763 if (PyTuple_GET_SIZE(args) == 0)
7764 return do_strip(self, BOTHSTRIP); /* Common case */
7765 else
7766 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007767}
7768
7769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007770PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007772\n\
7773Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007774If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007775
7776static PyObject *
7777unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7778{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007779 if (PyTuple_GET_SIZE(args) == 0)
7780 return do_strip(self, LEFTSTRIP); /* Common case */
7781 else
7782 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007783}
7784
7785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007786PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007788\n\
7789Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007790If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007791
7792static PyObject *
7793unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7794{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007795 if (PyTuple_GET_SIZE(args) == 0)
7796 return do_strip(self, RIGHTSTRIP); /* Common case */
7797 else
7798 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007799}
7800
7801
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007803unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804{
7805 PyUnicodeObject *u;
7806 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007807 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007808 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809
Georg Brandl222de0f2009-04-12 12:01:50 +00007810 if (len < 1) {
7811 Py_INCREF(unicode_empty);
7812 return (PyObject *)unicode_empty;
7813 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814
Tim Peters7a29bd52001-09-12 03:03:31 +00007815 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 /* no repeat, return original string */
7817 Py_INCREF(str);
7818 return (PyObject*) str;
7819 }
Tim Peters8f422462000-09-09 06:13:41 +00007820
7821 /* ensure # of chars needed doesn't overflow int and # of bytes
7822 * needed doesn't overflow size_t
7823 */
7824 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007825 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007826 PyErr_SetString(PyExc_OverflowError,
7827 "repeated string is too long");
7828 return NULL;
7829 }
7830 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7831 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7832 PyErr_SetString(PyExc_OverflowError,
7833 "repeated string is too long");
7834 return NULL;
7835 }
7836 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 if (!u)
7838 return NULL;
7839
7840 p = u->str;
7841
Georg Brandl222de0f2009-04-12 12:01:50 +00007842 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007843 Py_UNICODE_FILL(p, str->str[0], len);
7844 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007845 Py_ssize_t done = str->length; /* number of characters copied this far */
7846 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007848 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007849 Py_UNICODE_COPY(p+done, p, n);
7850 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 }
7853
7854 return (PyObject*) u;
7855}
7856
7857PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 PyObject *subobj,
7859 PyObject *replobj,
7860 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861{
7862 PyObject *self;
7863 PyObject *str1;
7864 PyObject *str2;
7865 PyObject *result;
7866
7867 self = PyUnicode_FromObject(obj);
7868 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 str1 = PyUnicode_FromObject(subobj);
7871 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 Py_DECREF(self);
7873 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 }
7875 str2 = PyUnicode_FromObject(replobj);
7876 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 Py_DECREF(self);
7878 Py_DECREF(str1);
7879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 }
Tim Petersced69f82003-09-16 20:30:58 +00007881 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 (PyUnicodeObject *)str1,
7883 (PyUnicodeObject *)str2,
7884 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885 Py_DECREF(self);
7886 Py_DECREF(str1);
7887 Py_DECREF(str2);
7888 return result;
7889}
7890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007891PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893\n\
7894Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007895old replaced by new. If the optional argument count is\n\
7896given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897
7898static PyObject*
7899unicode_replace(PyUnicodeObject *self, PyObject *args)
7900{
7901 PyUnicodeObject *str1;
7902 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007903 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904 PyObject *result;
7905
Martin v. Löwis18e16552006-02-15 17:27:45 +00007906 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907 return NULL;
7908 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7909 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007912 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 Py_DECREF(str1);
7914 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916
7917 result = replace(self, str1, str2, maxcount);
7918
7919 Py_DECREF(str1);
7920 Py_DECREF(str2);
7921 return result;
7922}
7923
7924static
7925PyObject *unicode_repr(PyObject *unicode)
7926{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007927 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007928 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007929 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7930 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7931
7932 /* XXX(nnorwitz): rather than over-allocating, it would be
7933 better to choose a different scheme. Perhaps scan the
7934 first N-chars of the string and allocate based on that size.
7935 */
7936 /* Initial allocation is based on the longest-possible unichr
7937 escape.
7938
7939 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7940 unichr, so in this case it's the longest unichr escape. In
7941 narrow (UTF-16) builds this is five chars per source unichr
7942 since there are two unichrs in the surrogate pair, so in narrow
7943 (UTF-16) builds it's not the longest unichr escape.
7944
7945 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7946 so in the narrow (UTF-16) build case it's the longest unichr
7947 escape.
7948 */
7949
Walter Dörwald1ab83302007-05-18 17:15:44 +00007950 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007952#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007954#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007956#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007958 if (repr == NULL)
7959 return NULL;
7960
Walter Dörwald1ab83302007-05-18 17:15:44 +00007961 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007962
7963 /* Add quote */
7964 *p++ = (findchar(s, size, '\'') &&
7965 !findchar(s, size, '"')) ? '"' : '\'';
7966 while (size-- > 0) {
7967 Py_UNICODE ch = *s++;
7968
7969 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007970 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007971 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007972 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007973 continue;
7974 }
7975
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007977 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007978 *p++ = '\\';
7979 *p++ = 't';
7980 }
7981 else if (ch == '\n') {
7982 *p++ = '\\';
7983 *p++ = 'n';
7984 }
7985 else if (ch == '\r') {
7986 *p++ = '\\';
7987 *p++ = 'r';
7988 }
7989
7990 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007991 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007992 *p++ = '\\';
7993 *p++ = 'x';
7994 *p++ = hexdigits[(ch >> 4) & 0x000F];
7995 *p++ = hexdigits[ch & 0x000F];
7996 }
7997
Georg Brandl559e5d72008-06-11 18:37:52 +00007998 /* Copy ASCII characters as-is */
7999 else if (ch < 0x7F) {
8000 *p++ = ch;
8001 }
8002
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008004 else {
8005 Py_UCS4 ucs = ch;
8006
8007#ifndef Py_UNICODE_WIDE
8008 Py_UNICODE ch2 = 0;
8009 /* Get code point from surrogate pair */
8010 if (size > 0) {
8011 ch2 = *s;
8012 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008014 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008017 size--;
8018 }
8019 }
8020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008021 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008022 (categories Z* and C* except ASCII space)
8023 */
8024 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8025 /* Map 8-bit characters to '\xhh' */
8026 if (ucs <= 0xff) {
8027 *p++ = '\\';
8028 *p++ = 'x';
8029 *p++ = hexdigits[(ch >> 4) & 0x000F];
8030 *p++ = hexdigits[ch & 0x000F];
8031 }
8032 /* Map 21-bit characters to '\U00xxxxxx' */
8033 else if (ucs >= 0x10000) {
8034 *p++ = '\\';
8035 *p++ = 'U';
8036 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8037 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8038 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8039 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8040 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8041 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8042 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8043 *p++ = hexdigits[ucs & 0x0000000F];
8044 }
8045 /* Map 16-bit characters to '\uxxxx' */
8046 else {
8047 *p++ = '\\';
8048 *p++ = 'u';
8049 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8050 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8051 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8052 *p++ = hexdigits[ucs & 0x000F];
8053 }
8054 }
8055 /* Copy characters as-is */
8056 else {
8057 *p++ = ch;
8058#ifndef Py_UNICODE_WIDE
8059 if (ucs >= 0x10000)
8060 *p++ = ch2;
8061#endif
8062 }
8063 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008064 }
8065 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008066 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008067
8068 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008069 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008070 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071}
8072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008073PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075\n\
8076Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008077such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078arguments start and end are interpreted as in slice notation.\n\
8079\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008080Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081
8082static PyObject *
8083unicode_rfind(PyUnicodeObject *self, PyObject *args)
8084{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008085 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008086 Py_ssize_t start;
8087 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008088 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089
Christian Heimes9cd17752007-11-18 19:35:23 +00008090 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092
Thomas Wouters477c8d52006-05-27 19:21:47 +00008093 result = stringlib_rfind_slice(
8094 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8095 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8096 start, end
8097 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098
8099 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008100
Christian Heimes217cfd12007-12-02 14:31:20 +00008101 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102}
8103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008104PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008107Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108
8109static PyObject *
8110unicode_rindex(PyUnicodeObject *self, PyObject *args)
8111{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008112 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008113 Py_ssize_t start;
8114 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008115 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116
Christian Heimes9cd17752007-11-18 19:35:23 +00008117 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119
Thomas Wouters477c8d52006-05-27 19:21:47 +00008120 result = stringlib_rfind_slice(
8121 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8122 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8123 start, end
8124 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125
8126 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008127
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 if (result < 0) {
8129 PyErr_SetString(PyExc_ValueError, "substring not found");
8130 return NULL;
8131 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008132 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133}
8134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008135PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008138Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008139done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140
8141static PyObject *
8142unicode_rjust(PyUnicodeObject *self, PyObject *args)
8143{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008144 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008145 Py_UNICODE fillchar = ' ';
8146
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008147 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 return NULL;
8149
Tim Peters7a29bd52001-09-12 03:03:31 +00008150 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 Py_INCREF(self);
8152 return (PyObject*) self;
8153 }
8154
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008155 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156}
8157
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 PyObject *sep,
8160 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161{
8162 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008163
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164 s = PyUnicode_FromObject(s);
8165 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008166 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 if (sep != NULL) {
8168 sep = PyUnicode_FromObject(sep);
8169 if (sep == NULL) {
8170 Py_DECREF(s);
8171 return NULL;
8172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 }
8174
8175 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8176
8177 Py_DECREF(s);
8178 Py_XDECREF(sep);
8179 return result;
8180}
8181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008182PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184\n\
8185Return a list of the words in S, using sep as the\n\
8186delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008187splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008188whitespace string is a separator and empty strings are\n\
8189removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190
8191static PyObject*
8192unicode_split(PyUnicodeObject *self, PyObject *args)
8193{
8194 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008195 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196
Martin v. Löwis18e16552006-02-15 17:27:45 +00008197 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 return NULL;
8199
8200 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206}
8207
Thomas Wouters477c8d52006-05-27 19:21:47 +00008208PyObject *
8209PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8210{
8211 PyObject* str_obj;
8212 PyObject* sep_obj;
8213 PyObject* out;
8214
8215 str_obj = PyUnicode_FromObject(str_in);
8216 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008218 sep_obj = PyUnicode_FromObject(sep_in);
8219 if (!sep_obj) {
8220 Py_DECREF(str_obj);
8221 return NULL;
8222 }
8223
8224 out = stringlib_partition(
8225 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8226 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8227 );
8228
8229 Py_DECREF(sep_obj);
8230 Py_DECREF(str_obj);
8231
8232 return out;
8233}
8234
8235
8236PyObject *
8237PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8238{
8239 PyObject* str_obj;
8240 PyObject* sep_obj;
8241 PyObject* out;
8242
8243 str_obj = PyUnicode_FromObject(str_in);
8244 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008246 sep_obj = PyUnicode_FromObject(sep_in);
8247 if (!sep_obj) {
8248 Py_DECREF(str_obj);
8249 return NULL;
8250 }
8251
8252 out = stringlib_rpartition(
8253 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8254 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8255 );
8256
8257 Py_DECREF(sep_obj);
8258 Py_DECREF(str_obj);
8259
8260 return out;
8261}
8262
8263PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008265\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008266Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008267the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008268found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008269
8270static PyObject*
8271unicode_partition(PyUnicodeObject *self, PyObject *separator)
8272{
8273 return PyUnicode_Partition((PyObject *)self, separator);
8274}
8275
8276PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008277 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008278\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008279Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008280the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008281separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008282
8283static PyObject*
8284unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8285{
8286 return PyUnicode_RPartition((PyObject *)self, separator);
8287}
8288
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008289PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 PyObject *sep,
8291 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008292{
8293 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008294
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008295 s = PyUnicode_FromObject(s);
8296 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008297 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 if (sep != NULL) {
8299 sep = PyUnicode_FromObject(sep);
8300 if (sep == NULL) {
8301 Py_DECREF(s);
8302 return NULL;
8303 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008304 }
8305
8306 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8307
8308 Py_DECREF(s);
8309 Py_XDECREF(sep);
8310 return result;
8311}
8312
8313PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008315\n\
8316Return a list of the words in S, using sep as the\n\
8317delimiter string, starting at the end of the string and\n\
8318working to the front. If maxsplit is given, at most maxsplit\n\
8319splits are done. If sep is not specified, any whitespace string\n\
8320is a separator.");
8321
8322static PyObject*
8323unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8324{
8325 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008326 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008327
Martin v. Löwis18e16552006-02-15 17:27:45 +00008328 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008329 return NULL;
8330
8331 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008333 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008335 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008337}
8338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008339PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341\n\
8342Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008343Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008344is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345
8346static PyObject*
8347unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8348{
Guido van Rossum86662912000-04-11 15:38:46 +00008349 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350
Guido van Rossum86662912000-04-11 15:38:46 +00008351 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 return NULL;
8353
Guido van Rossum86662912000-04-11 15:38:46 +00008354 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355}
8356
8357static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008358PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359{
Walter Dörwald346737f2007-05-31 10:44:43 +00008360 if (PyUnicode_CheckExact(self)) {
8361 Py_INCREF(self);
8362 return self;
8363 } else
8364 /* Subtype -- return genuine unicode string with the same value. */
8365 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8366 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367}
8368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008369PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371\n\
8372Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008373and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374
8375static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008376unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 return fixup(self, fixswapcase);
8379}
8380
Georg Brandlceee0772007-11-27 23:48:05 +00008381PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008383\n\
8384Return a translation table usable for str.translate().\n\
8385If there is only one argument, it must be a dictionary mapping Unicode\n\
8386ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008387Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008388If there are two arguments, they must be strings of equal length, and\n\
8389in the resulting dictionary, each character in x will be mapped to the\n\
8390character at the same position in y. If there is a third argument, it\n\
8391must be a string, whose characters will be mapped to None in the result.");
8392
8393static PyObject*
8394unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8395{
8396 PyObject *x, *y = NULL, *z = NULL;
8397 PyObject *new = NULL, *key, *value;
8398 Py_ssize_t i = 0;
8399 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008400
Georg Brandlceee0772007-11-27 23:48:05 +00008401 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8402 return NULL;
8403 new = PyDict_New();
8404 if (!new)
8405 return NULL;
8406 if (y != NULL) {
8407 /* x must be a string too, of equal length */
8408 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8409 if (!PyUnicode_Check(x)) {
8410 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8411 "be a string if there is a second argument");
8412 goto err;
8413 }
8414 if (PyUnicode_GET_SIZE(x) != ylen) {
8415 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8416 "arguments must have equal length");
8417 goto err;
8418 }
8419 /* create entries for translating chars in x to those in y */
8420 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008421 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8422 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008423 if (!key || !value)
8424 goto err;
8425 res = PyDict_SetItem(new, key, value);
8426 Py_DECREF(key);
8427 Py_DECREF(value);
8428 if (res < 0)
8429 goto err;
8430 }
8431 /* create entries for deleting chars in z */
8432 if (z != NULL) {
8433 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008434 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008435 if (!key)
8436 goto err;
8437 res = PyDict_SetItem(new, key, Py_None);
8438 Py_DECREF(key);
8439 if (res < 0)
8440 goto err;
8441 }
8442 }
8443 } else {
8444 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008445 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008446 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8447 "to maketrans it must be a dict");
8448 goto err;
8449 }
8450 /* copy entries into the new dict, converting string keys to int keys */
8451 while (PyDict_Next(x, &i, &key, &value)) {
8452 if (PyUnicode_Check(key)) {
8453 /* convert string keys to integer keys */
8454 PyObject *newkey;
8455 if (PyUnicode_GET_SIZE(key) != 1) {
8456 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8457 "table must be of length 1");
8458 goto err;
8459 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008460 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008461 if (!newkey)
8462 goto err;
8463 res = PyDict_SetItem(new, newkey, value);
8464 Py_DECREF(newkey);
8465 if (res < 0)
8466 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008467 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008468 /* just keep integer keys */
8469 if (PyDict_SetItem(new, key, value) < 0)
8470 goto err;
8471 } else {
8472 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8473 "be strings or integers");
8474 goto err;
8475 }
8476 }
8477 }
8478 return new;
8479 err:
8480 Py_DECREF(new);
8481 return NULL;
8482}
8483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008484PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486\n\
8487Return a copy of the string S, where all characters have been mapped\n\
8488through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008489Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008490Unmapped characters are left untouched. Characters mapped to None\n\
8491are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492
8493static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008494unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495{
Georg Brandlceee0772007-11-27 23:48:05 +00008496 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497}
8498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008499PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008502Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503
8504static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008505unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 return fixup(self, fixupper);
8508}
8509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008510PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008513Pad a numeric string S with zeros on the left, to fill a field\n\
8514of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515
8516static PyObject *
8517unicode_zfill(PyUnicodeObject *self, PyObject *args)
8518{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008519 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 PyUnicodeObject *u;
8521
Martin v. Löwis18e16552006-02-15 17:27:45 +00008522 Py_ssize_t width;
8523 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524 return NULL;
8525
8526 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008527 if (PyUnicode_CheckExact(self)) {
8528 Py_INCREF(self);
8529 return (PyObject*) self;
8530 }
8531 else
8532 return PyUnicode_FromUnicode(
8533 PyUnicode_AS_UNICODE(self),
8534 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 }
8537
8538 fill = width - self->length;
8539
8540 u = pad(self, fill, 0, '0');
8541
Walter Dörwald068325e2002-04-15 13:36:47 +00008542 if (u == NULL)
8543 return NULL;
8544
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545 if (u->str[fill] == '+' || u->str[fill] == '-') {
8546 /* move sign to beginning of string */
8547 u->str[0] = u->str[fill];
8548 u->str[fill] = '0';
8549 }
8550
8551 return (PyObject*) u;
8552}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553
#if 0
/* Compiled out.  Would return the current value of numfree as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008562PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008565Return True if S starts with the specified prefix, False otherwise.\n\
8566With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008567With optional end, stop comparing S at that position.\n\
8568prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569
8570static PyObject *
8571unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008574 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008576 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008577 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008578 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008580 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8582 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008583 if (PyTuple_Check(subobj)) {
8584 Py_ssize_t i;
8585 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8586 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008588 if (substring == NULL)
8589 return NULL;
8590 result = tailmatch(self, substring, start, end, -1);
8591 Py_DECREF(substring);
8592 if (result) {
8593 Py_RETURN_TRUE;
8594 }
8595 }
8596 /* nothing matched */
8597 Py_RETURN_FALSE;
8598 }
8599 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008602 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008604 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605}
8606
8607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008608PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008611Return True if S ends with the specified suffix, False otherwise.\n\
8612With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008613With optional end, stop comparing S at that position.\n\
8614suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615
8616static PyObject *
8617unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008620 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008622 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008623 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008624 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008626 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8628 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008629 if (PyTuple_Check(subobj)) {
8630 Py_ssize_t i;
8631 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8632 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008634 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008636 result = tailmatch(self, substring, start, end, +1);
8637 Py_DECREF(substring);
8638 if (result) {
8639 Py_RETURN_TRUE;
8640 }
8641 }
8642 Py_RETURN_FALSE;
8643 }
8644 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008648 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008650 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651}
8652
Eric Smith8c663262007-08-25 02:26:07 +00008653#include "stringlib/string_format.h"
8654
8655PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008657\n\
8658");
8659
Eric Smith4a7d76d2008-05-30 18:10:19 +00008660static PyObject *
8661unicode__format__(PyObject* self, PyObject* args)
8662{
8663 PyObject *format_spec;
8664
8665 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8666 return NULL;
8667
8668 return _PyUnicode_FormatAdvanced(self,
8669 PyUnicode_AS_UNICODE(format_spec),
8670 PyUnicode_GET_SIZE(format_spec));
8671}
8672
Eric Smith8c663262007-08-25 02:26:07 +00008673PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008675\n\
8676");
8677
8678static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008679unicode__sizeof__(PyUnicodeObject *v)
8680{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008681 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8682 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008683}
8684
8685PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008687
8688static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008689unicode_getnewargs(PyUnicodeObject *v)
8690{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008691 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008692}
8693
8694
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695static PyMethodDef unicode_methods[] = {
8696
8697 /* Order is according to common usage: often used methods should
8698 appear first, since lookup is done sequentially. */
8699
Benjamin Peterson308d6372009-09-18 21:42:35 +00008700 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008701 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8702 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008703 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008704 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8705 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8706 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8707 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8708 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8709 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8710 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008711 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008712 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8713 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8714 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008715 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008716 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8717 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8718 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008719 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008720 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008721 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008722 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008723 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8724 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8725 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8726 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8727 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8728 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8729 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8730 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8731 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8732 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8733 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8734 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8735 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8736 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008737 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008738 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008739 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008740 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008741 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008742 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8743 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008744 {"maketrans", (PyCFunction) unicode_maketrans,
8745 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008746 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008747#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008748 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749#endif
8750
8751#if 0
8752 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008753 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754#endif
8755
Benjamin Peterson14339b62009-01-31 16:36:08 +00008756 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 {NULL, NULL}
8758};
8759
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008760static PyObject *
8761unicode_mod(PyObject *v, PyObject *w)
8762{
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 if (!PyUnicode_Check(v)) {
8764 Py_INCREF(Py_NotImplemented);
8765 return Py_NotImplemented;
8766 }
8767 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008768}
8769
8770static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008771 0, /*nb_add*/
8772 0, /*nb_subtract*/
8773 0, /*nb_multiply*/
8774 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008775};
8776
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008778 (lenfunc) unicode_length, /* sq_length */
8779 PyUnicode_Concat, /* sq_concat */
8780 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8781 (ssizeargfunc) unicode_getitem, /* sq_item */
8782 0, /* sq_slice */
8783 0, /* sq_ass_item */
8784 0, /* sq_ass_slice */
8785 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786};
8787
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008788static PyObject*
8789unicode_subscript(PyUnicodeObject* self, PyObject* item)
8790{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008791 if (PyIndex_Check(item)) {
8792 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008793 if (i == -1 && PyErr_Occurred())
8794 return NULL;
8795 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008796 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008797 return unicode_getitem(self, i);
8798 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008799 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008800 Py_UNICODE* source_buf;
8801 Py_UNICODE* result_buf;
8802 PyObject* result;
8803
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008804 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008806 return NULL;
8807 }
8808
8809 if (slicelength <= 0) {
8810 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008811 } else if (start == 0 && step == 1 && slicelength == self->length &&
8812 PyUnicode_CheckExact(self)) {
8813 Py_INCREF(self);
8814 return (PyObject *)self;
8815 } else if (step == 1) {
8816 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008817 } else {
8818 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008819 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8820 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008821
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 if (result_buf == NULL)
8823 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008824
8825 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8826 result_buf[i] = source_buf[cur];
8827 }
Tim Petersced69f82003-09-16 20:30:58 +00008828
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008829 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008830 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008831 return result;
8832 }
8833 } else {
8834 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8835 return NULL;
8836 }
8837}
8838
8839static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008840 (lenfunc)unicode_length, /* mp_length */
8841 (binaryfunc)unicode_subscript, /* mp_subscript */
8842 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008843};
8844
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846/* Helpers for PyUnicode_Format() */
8847
8848static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008849getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008851 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 (*p_argidx)++;
8854 if (arglen < 0)
8855 return args;
8856 else
8857 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 }
8859 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008860 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 return NULL;
8862}
8863
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008864/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008866static PyObject *
8867formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008869 char *p;
8870 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008872
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 x = PyFloat_AsDouble(v);
8874 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008875 return NULL;
8876
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008879
Eric Smith0923d1d2009-04-16 20:16:10 +00008880 p = PyOS_double_to_string(x, type, prec,
8881 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008882 if (p == NULL)
8883 return NULL;
8884 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008885 PyMem_Free(p);
8886 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887}
8888
Tim Peters38fd5b62000-09-21 05:43:11 +00008889static PyObject*
8890formatlong(PyObject *val, int flags, int prec, int type)
8891{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008892 char *buf;
8893 int len;
8894 PyObject *str; /* temporary string object. */
8895 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008896
Benjamin Peterson14339b62009-01-31 16:36:08 +00008897 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8898 if (!str)
8899 return NULL;
8900 result = PyUnicode_FromStringAndSize(buf, len);
8901 Py_DECREF(str);
8902 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008903}
8904
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905static int
8906formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008907 size_t buflen,
8908 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008909{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008910 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008911 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 if (PyUnicode_GET_SIZE(v) == 1) {
8913 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8914 buf[1] = '\0';
8915 return 1;
8916 }
8917#ifndef Py_UNICODE_WIDE
8918 if (PyUnicode_GET_SIZE(v) == 2) {
8919 /* Decode a valid surrogate pair */
8920 int c0 = PyUnicode_AS_UNICODE(v)[0];
8921 int c1 = PyUnicode_AS_UNICODE(v)[1];
8922 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8923 0xDC00 <= c1 && c1 <= 0xDFFF) {
8924 buf[0] = c0;
8925 buf[1] = c1;
8926 buf[2] = '\0';
8927 return 2;
8928 }
8929 }
8930#endif
8931 goto onError;
8932 }
8933 else {
8934 /* Integer input truncated to a character */
8935 long x;
8936 x = PyLong_AsLong(v);
8937 if (x == -1 && PyErr_Occurred())
8938 goto onError;
8939
8940 if (x < 0 || x > 0x10ffff) {
8941 PyErr_SetString(PyExc_OverflowError,
8942 "%c arg not in range(0x110000)");
8943 return -1;
8944 }
8945
8946#ifndef Py_UNICODE_WIDE
8947 if (x > 0xffff) {
8948 x -= 0x10000;
8949 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8950 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8951 return 2;
8952 }
8953#endif
8954 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008955 buf[1] = '\0';
8956 return 1;
8957 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008958
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008960 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008962 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963}
8964
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008965/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008966 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008967*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008968#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008969
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972{
8973 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008974 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 int args_owned = 0;
8976 PyUnicodeObject *result = NULL;
8977 PyObject *dict = NULL;
8978 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008979
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 PyErr_BadInternalCall();
8982 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 }
8984 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008985 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987 fmt = PyUnicode_AS_UNICODE(uformat);
8988 fmtcnt = PyUnicode_GET_SIZE(uformat);
8989
8990 reslen = rescnt = fmtcnt + 100;
8991 result = _PyUnicode_New(reslen);
8992 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994 res = PyUnicode_AS_UNICODE(result);
8995
8996 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 arglen = PyTuple_Size(args);
8998 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999 }
9000 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 arglen = -1;
9002 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009004 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009005 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007
9008 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 if (*fmt != '%') {
9010 if (--rescnt < 0) {
9011 rescnt = fmtcnt + 100;
9012 reslen += rescnt;
9013 if (_PyUnicode_Resize(&result, reslen) < 0)
9014 goto onError;
9015 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9016 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009017 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009018 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009019 }
9020 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 /* Got a format specifier */
9022 int flags = 0;
9023 Py_ssize_t width = -1;
9024 int prec = -1;
9025 Py_UNICODE c = '\0';
9026 Py_UNICODE fill;
9027 int isnumok;
9028 PyObject *v = NULL;
9029 PyObject *temp = NULL;
9030 Py_UNICODE *pbuf;
9031 Py_UNICODE sign;
9032 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009033 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 fmt++;
9036 if (*fmt == '(') {
9037 Py_UNICODE *keystart;
9038 Py_ssize_t keylen;
9039 PyObject *key;
9040 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009041
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 if (dict == NULL) {
9043 PyErr_SetString(PyExc_TypeError,
9044 "format requires a mapping");
9045 goto onError;
9046 }
9047 ++fmt;
9048 --fmtcnt;
9049 keystart = fmt;
9050 /* Skip over balanced parentheses */
9051 while (pcount > 0 && --fmtcnt >= 0) {
9052 if (*fmt == ')')
9053 --pcount;
9054 else if (*fmt == '(')
9055 ++pcount;
9056 fmt++;
9057 }
9058 keylen = fmt - keystart - 1;
9059 if (fmtcnt < 0 || pcount > 0) {
9060 PyErr_SetString(PyExc_ValueError,
9061 "incomplete format key");
9062 goto onError;
9063 }
9064#if 0
9065 /* keys are converted to strings using UTF-8 and
9066 then looked up since Python uses strings to hold
9067 variables names etc. in its namespaces and we
9068 wouldn't want to break common idioms. */
9069 key = PyUnicode_EncodeUTF8(keystart,
9070 keylen,
9071 NULL);
9072#else
9073 key = PyUnicode_FromUnicode(keystart, keylen);
9074#endif
9075 if (key == NULL)
9076 goto onError;
9077 if (args_owned) {
9078 Py_DECREF(args);
9079 args_owned = 0;
9080 }
9081 args = PyObject_GetItem(dict, key);
9082 Py_DECREF(key);
9083 if (args == NULL) {
9084 goto onError;
9085 }
9086 args_owned = 1;
9087 arglen = -1;
9088 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009089 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 while (--fmtcnt >= 0) {
9091 switch (c = *fmt++) {
9092 case '-': flags |= F_LJUST; continue;
9093 case '+': flags |= F_SIGN; continue;
9094 case ' ': flags |= F_BLANK; continue;
9095 case '#': flags |= F_ALT; continue;
9096 case '0': flags |= F_ZERO; continue;
9097 }
9098 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009099 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 if (c == '*') {
9101 v = getnextarg(args, arglen, &argidx);
9102 if (v == NULL)
9103 goto onError;
9104 if (!PyLong_Check(v)) {
9105 PyErr_SetString(PyExc_TypeError,
9106 "* wants int");
9107 goto onError;
9108 }
9109 width = PyLong_AsLong(v);
9110 if (width == -1 && PyErr_Occurred())
9111 goto onError;
9112 if (width < 0) {
9113 flags |= F_LJUST;
9114 width = -width;
9115 }
9116 if (--fmtcnt >= 0)
9117 c = *fmt++;
9118 }
9119 else if (c >= '0' && c <= '9') {
9120 width = c - '0';
9121 while (--fmtcnt >= 0) {
9122 c = *fmt++;
9123 if (c < '0' || c > '9')
9124 break;
9125 if ((width*10) / 10 != width) {
9126 PyErr_SetString(PyExc_ValueError,
9127 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009128 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 }
9130 width = width*10 + (c - '0');
9131 }
9132 }
9133 if (c == '.') {
9134 prec = 0;
9135 if (--fmtcnt >= 0)
9136 c = *fmt++;
9137 if (c == '*') {
9138 v = getnextarg(args, arglen, &argidx);
9139 if (v == NULL)
9140 goto onError;
9141 if (!PyLong_Check(v)) {
9142 PyErr_SetString(PyExc_TypeError,
9143 "* wants int");
9144 goto onError;
9145 }
9146 prec = PyLong_AsLong(v);
9147 if (prec == -1 && PyErr_Occurred())
9148 goto onError;
9149 if (prec < 0)
9150 prec = 0;
9151 if (--fmtcnt >= 0)
9152 c = *fmt++;
9153 }
9154 else if (c >= '0' && c <= '9') {
9155 prec = c - '0';
9156 while (--fmtcnt >= 0) {
9157 c = Py_CHARMASK(*fmt++);
9158 if (c < '0' || c > '9')
9159 break;
9160 if ((prec*10) / 10 != prec) {
9161 PyErr_SetString(PyExc_ValueError,
9162 "prec too big");
9163 goto onError;
9164 }
9165 prec = prec*10 + (c - '0');
9166 }
9167 }
9168 } /* prec */
9169 if (fmtcnt >= 0) {
9170 if (c == 'h' || c == 'l' || c == 'L') {
9171 if (--fmtcnt >= 0)
9172 c = *fmt++;
9173 }
9174 }
9175 if (fmtcnt < 0) {
9176 PyErr_SetString(PyExc_ValueError,
9177 "incomplete format");
9178 goto onError;
9179 }
9180 if (c != '%') {
9181 v = getnextarg(args, arglen, &argidx);
9182 if (v == NULL)
9183 goto onError;
9184 }
9185 sign = 0;
9186 fill = ' ';
9187 switch (c) {
9188
9189 case '%':
9190 pbuf = formatbuf;
9191 /* presume that buffer length is at least 1 */
9192 pbuf[0] = '%';
9193 len = 1;
9194 break;
9195
9196 case 's':
9197 case 'r':
9198 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009199 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 temp = v;
9201 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009202 }
9203 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 if (c == 's')
9205 temp = PyObject_Str(v);
9206 else if (c == 'r')
9207 temp = PyObject_Repr(v);
9208 else
9209 temp = PyObject_ASCII(v);
9210 if (temp == NULL)
9211 goto onError;
9212 if (PyUnicode_Check(temp))
9213 /* nothing to do */;
9214 else {
9215 Py_DECREF(temp);
9216 PyErr_SetString(PyExc_TypeError,
9217 "%s argument has non-string str()");
9218 goto onError;
9219 }
9220 }
9221 pbuf = PyUnicode_AS_UNICODE(temp);
9222 len = PyUnicode_GET_SIZE(temp);
9223 if (prec >= 0 && len > prec)
9224 len = prec;
9225 break;
9226
9227 case 'i':
9228 case 'd':
9229 case 'u':
9230 case 'o':
9231 case 'x':
9232 case 'X':
9233 if (c == 'i')
9234 c = 'd';
9235 isnumok = 0;
9236 if (PyNumber_Check(v)) {
9237 PyObject *iobj=NULL;
9238
9239 if (PyLong_Check(v)) {
9240 iobj = v;
9241 Py_INCREF(iobj);
9242 }
9243 else {
9244 iobj = PyNumber_Long(v);
9245 }
9246 if (iobj!=NULL) {
9247 if (PyLong_Check(iobj)) {
9248 isnumok = 1;
9249 temp = formatlong(iobj, flags, prec, c);
9250 Py_DECREF(iobj);
9251 if (!temp)
9252 goto onError;
9253 pbuf = PyUnicode_AS_UNICODE(temp);
9254 len = PyUnicode_GET_SIZE(temp);
9255 sign = 1;
9256 }
9257 else {
9258 Py_DECREF(iobj);
9259 }
9260 }
9261 }
9262 if (!isnumok) {
9263 PyErr_Format(PyExc_TypeError,
9264 "%%%c format: a number is required, "
9265 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9266 goto onError;
9267 }
9268 if (flags & F_ZERO)
9269 fill = '0';
9270 break;
9271
9272 case 'e':
9273 case 'E':
9274 case 'f':
9275 case 'F':
9276 case 'g':
9277 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009278 temp = formatfloat(v, flags, prec, c);
9279 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009280 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009281 pbuf = PyUnicode_AS_UNICODE(temp);
9282 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009283 sign = 1;
9284 if (flags & F_ZERO)
9285 fill = '0';
9286 break;
9287
9288 case 'c':
9289 pbuf = formatbuf;
9290 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9291 if (len < 0)
9292 goto onError;
9293 break;
9294
9295 default:
9296 PyErr_Format(PyExc_ValueError,
9297 "unsupported format character '%c' (0x%x) "
9298 "at index %zd",
9299 (31<=c && c<=126) ? (char)c : '?',
9300 (int)c,
9301 (Py_ssize_t)(fmt - 1 -
9302 PyUnicode_AS_UNICODE(uformat)));
9303 goto onError;
9304 }
9305 if (sign) {
9306 if (*pbuf == '-' || *pbuf == '+') {
9307 sign = *pbuf++;
9308 len--;
9309 }
9310 else if (flags & F_SIGN)
9311 sign = '+';
9312 else if (flags & F_BLANK)
9313 sign = ' ';
9314 else
9315 sign = 0;
9316 }
9317 if (width < len)
9318 width = len;
9319 if (rescnt - (sign != 0) < width) {
9320 reslen -= rescnt;
9321 rescnt = width + fmtcnt + 100;
9322 reslen += rescnt;
9323 if (reslen < 0) {
9324 Py_XDECREF(temp);
9325 PyErr_NoMemory();
9326 goto onError;
9327 }
9328 if (_PyUnicode_Resize(&result, reslen) < 0) {
9329 Py_XDECREF(temp);
9330 goto onError;
9331 }
9332 res = PyUnicode_AS_UNICODE(result)
9333 + reslen - rescnt;
9334 }
9335 if (sign) {
9336 if (fill != ' ')
9337 *res++ = sign;
9338 rescnt--;
9339 if (width > len)
9340 width--;
9341 }
9342 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9343 assert(pbuf[0] == '0');
9344 assert(pbuf[1] == c);
9345 if (fill != ' ') {
9346 *res++ = *pbuf++;
9347 *res++ = *pbuf++;
9348 }
9349 rescnt -= 2;
9350 width -= 2;
9351 if (width < 0)
9352 width = 0;
9353 len -= 2;
9354 }
9355 if (width > len && !(flags & F_LJUST)) {
9356 do {
9357 --rescnt;
9358 *res++ = fill;
9359 } while (--width > len);
9360 }
9361 if (fill == ' ') {
9362 if (sign)
9363 *res++ = sign;
9364 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9365 assert(pbuf[0] == '0');
9366 assert(pbuf[1] == c);
9367 *res++ = *pbuf++;
9368 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009369 }
9370 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009371 Py_UNICODE_COPY(res, pbuf, len);
9372 res += len;
9373 rescnt -= len;
9374 while (--width >= len) {
9375 --rescnt;
9376 *res++ = ' ';
9377 }
9378 if (dict && (argidx < arglen) && c != '%') {
9379 PyErr_SetString(PyExc_TypeError,
9380 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009381 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 goto onError;
9383 }
9384 Py_XDECREF(temp);
9385 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 } /* until end */
9387 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 PyErr_SetString(PyExc_TypeError,
9389 "not all arguments converted during string formatting");
9390 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391 }
9392
Thomas Woutersa96affe2006-03-12 00:29:36 +00009393 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009396 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397 }
9398 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399 return (PyObject *)result;
9400
Benjamin Peterson29060642009-01-31 22:14:21 +00009401 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009402 Py_XDECREF(result);
9403 Py_DECREF(uformat);
9404 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009405 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406 }
9407 return NULL;
9408}
9409
Jeremy Hylton938ace62002-07-17 16:30:39 +00009410static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009411unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9412
Tim Peters6d6c1a32001-08-02 04:15:00 +00009413static PyObject *
9414unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9415{
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009417 static char *kwlist[] = {"object", "encoding", "errors", 0};
9418 char *encoding = NULL;
9419 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009420
Benjamin Peterson14339b62009-01-31 16:36:08 +00009421 if (type != &PyUnicode_Type)
9422 return unicode_subtype_new(type, args, kwds);
9423 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009424 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009425 return NULL;
9426 if (x == NULL)
9427 return (PyObject *)_PyUnicode_New(0);
9428 if (encoding == NULL && errors == NULL)
9429 return PyObject_Str(x);
9430 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009431 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009432}
9433
Guido van Rossume023fe02001-08-30 03:12:59 +00009434static PyObject *
9435unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9436{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009437 PyUnicodeObject *tmp, *pnew;
9438 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009439
Benjamin Peterson14339b62009-01-31 16:36:08 +00009440 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9441 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9442 if (tmp == NULL)
9443 return NULL;
9444 assert(PyUnicode_Check(tmp));
9445 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9446 if (pnew == NULL) {
9447 Py_DECREF(tmp);
9448 return NULL;
9449 }
9450 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9451 if (pnew->str == NULL) {
9452 _Py_ForgetReference((PyObject *)pnew);
9453 PyObject_Del(pnew);
9454 Py_DECREF(tmp);
9455 return PyErr_NoMemory();
9456 }
9457 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9458 pnew->length = n;
9459 pnew->hash = tmp->hash;
9460 Py_DECREF(tmp);
9461 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009462}
9463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009464PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009465 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009466\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009467Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009468encoding defaults to the current default string encoding.\n\
9469errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009470
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009471static PyObject *unicode_iter(PyObject *seq);
9472
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009474 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009475 "str", /* tp_name */
9476 sizeof(PyUnicodeObject), /* tp_size */
9477 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009479 (destructor)unicode_dealloc, /* tp_dealloc */
9480 0, /* tp_print */
9481 0, /* tp_getattr */
9482 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009483 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009484 unicode_repr, /* tp_repr */
9485 &unicode_as_number, /* tp_as_number */
9486 &unicode_as_sequence, /* tp_as_sequence */
9487 &unicode_as_mapping, /* tp_as_mapping */
9488 (hashfunc) unicode_hash, /* tp_hash*/
9489 0, /* tp_call*/
9490 (reprfunc) unicode_str, /* tp_str */
9491 PyObject_GenericGetAttr, /* tp_getattro */
9492 0, /* tp_setattro */
9493 0, /* tp_as_buffer */
9494 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009496 unicode_doc, /* tp_doc */
9497 0, /* tp_traverse */
9498 0, /* tp_clear */
9499 PyUnicode_RichCompare, /* tp_richcompare */
9500 0, /* tp_weaklistoffset */
9501 unicode_iter, /* tp_iter */
9502 0, /* tp_iternext */
9503 unicode_methods, /* tp_methods */
9504 0, /* tp_members */
9505 0, /* tp_getset */
9506 &PyBaseObject_Type, /* tp_base */
9507 0, /* tp_dict */
9508 0, /* tp_descr_get */
9509 0, /* tp_descr_set */
9510 0, /* tp_dictoffset */
9511 0, /* tp_init */
9512 0, /* tp_alloc */
9513 unicode_new, /* tp_new */
9514 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515};
9516
9517/* Initialize the Unicode implementation */
9518
Thomas Wouters78890102000-07-22 19:25:51 +00009519void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009521 int i;
9522
Thomas Wouters477c8d52006-05-27 19:21:47 +00009523 /* XXX - move this array to unicodectype.c ? */
9524 Py_UNICODE linebreak[] = {
9525 0x000A, /* LINE FEED */
9526 0x000D, /* CARRIAGE RETURN */
9527 0x001C, /* FILE SEPARATOR */
9528 0x001D, /* GROUP SEPARATOR */
9529 0x001E, /* RECORD SEPARATOR */
9530 0x0085, /* NEXT LINE */
9531 0x2028, /* LINE SEPARATOR */
9532 0x2029, /* PARAGRAPH SEPARATOR */
9533 };
9534
Fred Drakee4315f52000-05-09 19:53:39 +00009535 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009536 free_list = NULL;
9537 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009539 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009541
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009542 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009544 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009545 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009546
9547 /* initialize the linebreak bloom filter */
9548 bloom_linebreak = make_bloom_mask(
9549 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9550 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009551
9552 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553}
9554
9555/* Finalize the Unicode implementation */
9556
Christian Heimesa156e092008-02-16 07:38:31 +00009557int
9558PyUnicode_ClearFreeList(void)
9559{
9560 int freelist_size = numfree;
9561 PyUnicodeObject *u;
9562
9563 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009564 PyUnicodeObject *v = u;
9565 u = *(PyUnicodeObject **)u;
9566 if (v->str)
9567 PyObject_DEL(v->str);
9568 Py_XDECREF(v->defenc);
9569 PyObject_Del(v);
9570 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009571 }
9572 free_list = NULL;
9573 assert(numfree == 0);
9574 return freelist_size;
9575}
9576
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577void
Thomas Wouters78890102000-07-22 19:25:51 +00009578_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009580 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009582 Py_XDECREF(unicode_empty);
9583 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009584
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009585 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 if (unicode_latin1[i]) {
9587 Py_DECREF(unicode_latin1[i]);
9588 unicode_latin1[i] = NULL;
9589 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009590 }
Christian Heimesa156e092008-02-16 07:38:31 +00009591 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009593
Walter Dörwald16807132007-05-25 13:52:07 +00009594void
9595PyUnicode_InternInPlace(PyObject **p)
9596{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009597 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9598 PyObject *t;
9599 if (s == NULL || !PyUnicode_Check(s))
9600 Py_FatalError(
9601 "PyUnicode_InternInPlace: unicode strings only please!");
9602 /* If it's a subclass, we don't really know what putting
9603 it in the interned dict might do. */
9604 if (!PyUnicode_CheckExact(s))
9605 return;
9606 if (PyUnicode_CHECK_INTERNED(s))
9607 return;
9608 if (interned == NULL) {
9609 interned = PyDict_New();
9610 if (interned == NULL) {
9611 PyErr_Clear(); /* Don't leave an exception */
9612 return;
9613 }
9614 }
9615 /* It might be that the GetItem call fails even
9616 though the key is present in the dictionary,
9617 namely when this happens during a stack overflow. */
9618 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009619 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009620 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009621
Benjamin Peterson29060642009-01-31 22:14:21 +00009622 if (t) {
9623 Py_INCREF(t);
9624 Py_DECREF(*p);
9625 *p = t;
9626 return;
9627 }
Walter Dörwald16807132007-05-25 13:52:07 +00009628
Benjamin Peterson14339b62009-01-31 16:36:08 +00009629 PyThreadState_GET()->recursion_critical = 1;
9630 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9631 PyErr_Clear();
9632 PyThreadState_GET()->recursion_critical = 0;
9633 return;
9634 }
9635 PyThreadState_GET()->recursion_critical = 0;
9636 /* The two references in interned are not counted by refcnt.
9637 The deallocator will take care of this */
9638 Py_REFCNT(s) -= 2;
9639 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009640}
9641
9642void
9643PyUnicode_InternImmortal(PyObject **p)
9644{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009645 PyUnicode_InternInPlace(p);
9646 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9647 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9648 Py_INCREF(*p);
9649 }
Walter Dörwald16807132007-05-25 13:52:07 +00009650}
9651
9652PyObject *
9653PyUnicode_InternFromString(const char *cp)
9654{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009655 PyObject *s = PyUnicode_FromString(cp);
9656 if (s == NULL)
9657 return NULL;
9658 PyUnicode_InternInPlace(&s);
9659 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009660}
9661
9662void _Py_ReleaseInternedUnicodeStrings(void)
9663{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009664 PyObject *keys;
9665 PyUnicodeObject *s;
9666 Py_ssize_t i, n;
9667 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009668
Benjamin Peterson14339b62009-01-31 16:36:08 +00009669 if (interned == NULL || !PyDict_Check(interned))
9670 return;
9671 keys = PyDict_Keys(interned);
9672 if (keys == NULL || !PyList_Check(keys)) {
9673 PyErr_Clear();
9674 return;
9675 }
Walter Dörwald16807132007-05-25 13:52:07 +00009676
Benjamin Peterson14339b62009-01-31 16:36:08 +00009677 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9678 detector, interned unicode strings are not forcibly deallocated;
9679 rather, we give them their stolen references back, and then clear
9680 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009681
Benjamin Peterson14339b62009-01-31 16:36:08 +00009682 n = PyList_GET_SIZE(keys);
9683 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009684 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009685 for (i = 0; i < n; i++) {
9686 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9687 switch (s->state) {
9688 case SSTATE_NOT_INTERNED:
9689 /* XXX Shouldn't happen */
9690 break;
9691 case SSTATE_INTERNED_IMMORTAL:
9692 Py_REFCNT(s) += 1;
9693 immortal_size += s->length;
9694 break;
9695 case SSTATE_INTERNED_MORTAL:
9696 Py_REFCNT(s) += 2;
9697 mortal_size += s->length;
9698 break;
9699 default:
9700 Py_FatalError("Inconsistent interned string state.");
9701 }
9702 s->state = SSTATE_NOT_INTERNED;
9703 }
9704 fprintf(stderr, "total size of all interned strings: "
9705 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9706 "mortal/immortal\n", mortal_size, immortal_size);
9707 Py_DECREF(keys);
9708 PyDict_Clear(interned);
9709 Py_DECREF(interned);
9710 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009711}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009712
9713
9714/********************* Unicode Iterator **************************/
9715
9716typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009717 PyObject_HEAD
9718 Py_ssize_t it_index;
9719 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009720} unicodeiterobject;
9721
9722static void
9723unicodeiter_dealloc(unicodeiterobject *it)
9724{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009725 _PyObject_GC_UNTRACK(it);
9726 Py_XDECREF(it->it_seq);
9727 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009728}
9729
9730static int
9731unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9732{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009733 Py_VISIT(it->it_seq);
9734 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009735}
9736
9737static PyObject *
9738unicodeiter_next(unicodeiterobject *it)
9739{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009740 PyUnicodeObject *seq;
9741 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009742
Benjamin Peterson14339b62009-01-31 16:36:08 +00009743 assert(it != NULL);
9744 seq = it->it_seq;
9745 if (seq == NULL)
9746 return NULL;
9747 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009748
Benjamin Peterson14339b62009-01-31 16:36:08 +00009749 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9750 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009751 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009752 if (item != NULL)
9753 ++it->it_index;
9754 return item;
9755 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009756
Benjamin Peterson14339b62009-01-31 16:36:08 +00009757 Py_DECREF(seq);
9758 it->it_seq = NULL;
9759 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009760}
9761
9762static PyObject *
9763unicodeiter_len(unicodeiterobject *it)
9764{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009765 Py_ssize_t len = 0;
9766 if (it->it_seq)
9767 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9768 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009769}
9770
9771PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9772
9773static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009774 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009775 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009776 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009777};
9778
9779PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009780 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9781 "str_iterator", /* tp_name */
9782 sizeof(unicodeiterobject), /* tp_basicsize */
9783 0, /* tp_itemsize */
9784 /* methods */
9785 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9786 0, /* tp_print */
9787 0, /* tp_getattr */
9788 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009789 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009790 0, /* tp_repr */
9791 0, /* tp_as_number */
9792 0, /* tp_as_sequence */
9793 0, /* tp_as_mapping */
9794 0, /* tp_hash */
9795 0, /* tp_call */
9796 0, /* tp_str */
9797 PyObject_GenericGetAttr, /* tp_getattro */
9798 0, /* tp_setattro */
9799 0, /* tp_as_buffer */
9800 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9801 0, /* tp_doc */
9802 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9803 0, /* tp_clear */
9804 0, /* tp_richcompare */
9805 0, /* tp_weaklistoffset */
9806 PyObject_SelfIter, /* tp_iter */
9807 (iternextfunc)unicodeiter_next, /* tp_iternext */
9808 unicodeiter_methods, /* tp_methods */
9809 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009810};
9811
9812static PyObject *
9813unicode_iter(PyObject *seq)
9814{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009815 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009816
Benjamin Peterson14339b62009-01-31 16:36:08 +00009817 if (!PyUnicode_Check(seq)) {
9818 PyErr_BadInternalCall();
9819 return NULL;
9820 }
9821 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9822 if (it == NULL)
9823 return NULL;
9824 it->it_index = 0;
9825 Py_INCREF(seq);
9826 it->it_seq = (PyUnicodeObject *)seq;
9827 _PyObject_GC_TRACK(it);
9828 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009829}
9830
Martin v. Löwis5b222132007-06-10 09:51:05 +00009831size_t
9832Py_UNICODE_strlen(const Py_UNICODE *u)
9833{
9834 int res = 0;
9835 while(*u++)
9836 res++;
9837 return res;
9838}
9839
9840Py_UNICODE*
9841Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9842{
9843 Py_UNICODE *u = s1;
9844 while ((*u++ = *s2++));
9845 return s1;
9846}
9847
9848Py_UNICODE*
9849Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9850{
9851 Py_UNICODE *u = s1;
9852 while ((*u++ = *s2++))
9853 if (n-- == 0)
9854 break;
9855 return s1;
9856}
9857
9858int
9859Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9860{
9861 while (*s1 && *s2 && *s1 == *s2)
9862 s1++, s2++;
9863 if (*s1 && *s2)
9864 return (*s1 < *s2) ? -1 : +1;
9865 if (*s1)
9866 return 1;
9867 if (*s2)
9868 return -1;
9869 return 0;
9870}
9871
9872Py_UNICODE*
9873Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9874{
9875 const Py_UNICODE *p;
9876 for (p = s; *p; p++)
9877 if (*p == c)
9878 return (Py_UNICODE*)p;
9879 return NULL;
9880}
9881
9882
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009883#ifdef __cplusplus
9884}
9885#endif