/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of deallocated Unicode objects that are
   parked on the free list for later reuse. */

#define PyUnicode_MAXFREELIST       1024

/* Threshold for the free list "keep-alive" optimization.

   An object that goes onto the free list keeps its character buffer
   when its length is below this limit; longer buffers are released
   immediately.  This saves a malloc() for small Unicode objects.

   At worst this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.
   The table has one entry per ASCII code point (0..127); an entry is 1
   when that code point is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009: HORIZONTAL TABULATION
     *   0x000A: LINE FEED
     *   0x000B: VERTICAL TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     *   0x001F: UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020: SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same for linebreaks: one entry per ASCII code point (0..127), 1 when
   the code point is treated as a line break.  The table is only ever
   read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A: LINE FEED
     *   0x000D: CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000185/* --- Bloom Filters ----------------------------------------------------- */
186
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */
192
/* Integer type holding a bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long: shifting a plain int 1 left
   by 31 would overflow a signed int, which is undefined behaviour. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: table lookup for ASCII, otherwise the bloom mask is
   consulted first so that most characters skip the full
   Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* Membership test for chr in set: the cheap bloom check runs first and
   the linear search only when the bloom bit is set.  The expansion is
   fully parenthesized so it can be negated or combined safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000247 if (unicode == unicode_empty ||
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
Georg Brandl6290bcf2010-08-01 21:48:47 +0000275 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 }
277 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000278
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return 0;
280}
281
/* We allocate one more Py_UNICODE than requested to make sure the
   string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
289
290static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000291PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292{
293 register PyUnicodeObject *unicode;
294
Andrew Dalkee0df7622006-05-27 11:04:36 +0000295 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 if (length == 0 && unicode_empty != NULL) {
297 Py_INCREF(unicode_empty);
298 return unicode_empty;
299 }
300
Neal Norwitze7d8be82008-07-31 17:17:14 +0000301 /* Ensure we won't overflow the size. */
302 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
303 return (PyUnicodeObject *)PyErr_NoMemory();
304 }
305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000307 if (free_list) {
308 unicode = free_list;
309 free_list = *(PyUnicodeObject **)unicode;
310 numfree--;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000311 if (unicode->str) {
312 /* Keep-Alive optimization: we only upsize the buffer,
313 never downsize it. */
314 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000315 unicode_resize(unicode, length) < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000316 PyObject_DEL(unicode->str);
317 unicode->str = NULL;
318 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000319 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000320 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000321 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
322 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000323 }
324 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000325 }
326 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000327 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000328 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 if (unicode == NULL)
330 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000331 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
332 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 }
334
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000335 if (!unicode->str) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000336 PyErr_NoMemory();
337 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000338 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000339 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000340 * the caller fails before initializing str -- unicode_resize()
341 * reads str[0], and the Keep-Alive optimization can keep memory
342 * allocated for str alive across a call to unicode_dealloc(unicode).
343 * We don't want unicode_resize to read uninitialized memory in
344 * that case.
345 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000346 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000348 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000350 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000352
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000353 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000354 /* XXX UNREF/NEWREF interface should be more symmetrical */
355 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000356 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000357 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359}
360
361static
Guido van Rossum9475a232001-10-05 20:51:39 +0000362void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000364 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000365 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000366 /* Keep-Alive optimization */
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000367 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
368 PyObject_DEL(unicode->str);
369 unicode->str = NULL;
370 unicode->length = 0;
371 }
372 if (unicode->defenc) {
Georg Brandl6290bcf2010-08-01 21:48:47 +0000373 Py_CLEAR(unicode->defenc);
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000374 }
375 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000376 *(PyUnicodeObject **)unicode = free_list;
377 free_list = unicode;
378 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 }
380 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000381 PyObject_DEL(unicode->str);
382 Py_XDECREF(unicode->defenc);
383 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 }
385}
386
Benjamin Peterson828a7062008-12-27 17:05:29 +0000387static
388int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389{
390 register PyUnicodeObject *v;
391
392 /* Argument checks */
393 if (unicode == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000394 PyErr_BadInternalCall();
395 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000396 }
Benjamin Peterson828a7062008-12-27 17:05:29 +0000397 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000398 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000399 PyErr_BadInternalCall();
400 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000401 }
402
403 /* Resizing unicode_empty and single character objects is not
404 possible since these are being shared. We simply return a fresh
405 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000406 if (v->length != length &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000407 (v == unicode_empty || v->length == 1)) {
408 PyUnicodeObject *w = _PyUnicode_New(length);
409 if (w == NULL)
410 return -1;
411 Py_UNICODE_COPY(w->str, v->str,
412 length < v->length ? length : v->length);
413 Py_DECREF(*unicode);
414 *unicode = w;
415 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 }
417
418 /* Note that we don't have to modify *unicode for unshared Unicode
419 objects, since we can modify them in-place. */
420 return unicode_resize(v, length);
421}
422
Benjamin Peterson828a7062008-12-27 17:05:29 +0000423int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
424{
425 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
426}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000429 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430{
431 PyUnicodeObject *unicode;
432
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433 /* If the Unicode data is known at construction time, we can apply
434 some optimizations which share commonly used objects. */
435 if (u != NULL) {
436
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000437 /* Optimization for empty strings */
438 if (size == 0 && unicode_empty != NULL) {
439 Py_INCREF(unicode_empty);
440 return (PyObject *)unicode_empty;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000441 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000442
443 /* Single character Unicode objects in the Latin-1 range are
444 shared when using this constructor */
445 if (size == 1 && *u < 256) {
446 unicode = unicode_latin1[*u];
447 if (!unicode) {
448 unicode = _PyUnicode_New(1);
449 if (!unicode)
450 return NULL;
451 unicode->str[0] = *u;
452 unicode_latin1[*u] = unicode;
453 }
454 Py_INCREF(unicode);
455 return (PyObject *)unicode;
456 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000457 }
Tim Petersced69f82003-09-16 20:30:58 +0000458
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459 unicode = _PyUnicode_New(size);
460 if (!unicode)
461 return NULL;
462
463 /* Copy the Unicode data into the new object */
464 if (u != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000465 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000466
467 return (PyObject *)unicode;
468}
469
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000470PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
471{
472 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000473
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000474 if (size < 0) {
475 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000476 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000477 return NULL;
478 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000479
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects.
482 Also, this means the input must be UTF-8, so fall back to the
483 UTF-8 decoder at the end. */
484 if (u != NULL) {
485
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000490 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000491
492 /* Single characters are shared when using this constructor.
493 Restrict to ASCII, since the input must be UTF-8. */
494 if (size == 1 && Py_CHARMASK(*u) < 128) {
495 unicode = unicode_latin1[Py_CHARMASK(*u)];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = Py_CHARMASK(*u);
501 unicode_latin1[Py_CHARMASK(*u)] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000506
507 return PyUnicode_DecodeUTF8(u, size, NULL);
508 }
509
510 unicode = _PyUnicode_New(size);
511 if (!unicode)
512 return NULL;
513
514 return (PyObject *)unicode;
515}
516
517PyObject *PyUnicode_FromString(const char *u)
518{
519 size_t size = strlen(u);
520 if (size > PY_SSIZE_T_MAX) {
521 PyErr_SetString(PyExc_OverflowError, "input too long");
522 return NULL;
523 }
524
525 return PyUnicode_FromStringAndSize(u, size);
526}
527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528#ifdef HAVE_WCHAR_H
529
530PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000531 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532{
533 PyUnicodeObject *unicode;
534
535 if (w == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000536 PyErr_BadInternalCall();
537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000538 }
539
540 unicode = _PyUnicode_New(size);
541 if (!unicode)
542 return NULL;
543
544 /* Copy the wchar_t data into the new object */
545#ifdef HAVE_USABLE_WCHAR_T
546 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000547#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000549 register Py_UNICODE *u;
550 register Py_ssize_t i;
551 u = PyUnicode_AS_UNICODE(unicode);
552 for (i = size; i > 0; i--)
553 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 }
555#endif
556
557 return (PyObject *)unicode;
558}
559
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000560static void
561makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
562{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000563 *fmt++ = '%';
564 if (width) {
565 if (zeropad)
566 *fmt++ = '0';
567 fmt += sprintf(fmt, "%d", width);
568 }
569 if (precision)
570 fmt += sprintf(fmt, ".%d", precision);
571 if (longflag)
572 *fmt++ = 'l';
573 else if (size_tflag) {
574 char *f = PY_FORMAT_SIZE_T;
575 while (*f)
576 *fmt++ = *f++;
577 }
578 *fmt++ = c;
579 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000580}
581
/* Append the NUL-terminated string to the output position s, advancing
   s past the copied characters.  Relies on variables named copy and s
   being in scope at the point of use. */
#define appendstring(string) {for (copy = (string); *copy;) *s++ = *copy++;}
583
584PyObject *
585PyUnicode_FromFormatV(const char *format, va_list vargs)
586{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000587 va_list count;
588 Py_ssize_t callcount = 0;
589 PyObject **callresults = NULL;
590 PyObject **callresult = NULL;
591 Py_ssize_t n = 0;
592 int width = 0;
593 int precision = 0;
594 int zeropad;
595 const char* f;
596 Py_UNICODE *s;
597 PyObject *string;
598 /* used by sprintf */
599 char buffer[21];
600 /* use abuffer instead of buffer, if we need more space
601 * (which can happen if there's a format specifier with width). */
602 char *abuffer = NULL;
603 char *realbuffer;
604 Py_ssize_t abuffersize = 0;
605 char fmt[60]; /* should be enough for %0width.precisionld */
606 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000607
608#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000609 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000610#else
611#ifdef __va_copy
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000612 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000613#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000614 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000615#endif
616#endif
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000617 /* step 1: count the number of %S/%R/%s format specifications
618 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
619 * objects once during step 3 and put the result in an array) */
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000620 for (f = format; *f; f++) {
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000621 if (*f == '%') {
622 if (*(f+1)=='%')
623 continue;
Walter Dörwald67032252009-05-03 22:46:50 +0000624 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000625 ++callcount;
626 while (isdigit((unsigned)*f))
627 width = (width*10) + *f++ - '0';
628 while (*++f && *f != '%' && !isalpha((unsigned)*f))
629 ;
630 if (*f == 's')
631 ++callcount;
632 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000633 }
634 /* step 2: allocate memory for the results of
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000635 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000636 if (callcount) {
637 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
638 if (!callresults) {
639 PyErr_NoMemory();
640 return NULL;
641 }
642 callresult = callresults;
643 }
644 /* step 3: figure out how large a buffer we need */
645 for (f = format; *f; f++) {
646 if (*f == '%') {
647 const char* p = f;
648 width = 0;
649 while (isdigit((unsigned)*f))
650 width = (width*10) + *f++ - '0';
651 while (*++f && *f != '%' && !isalpha((unsigned)*f))
652 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000653
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000654 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
655 * they don't affect the amount of space we reserve.
656 */
657 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000658 (f[1] == 'd' || f[1] == 'u'))
659 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000660
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000661 switch (*f) {
662 case 'c':
663 (void)va_arg(count, int);
664 /* fall through... */
665 case '%':
666 n++;
667 break;
668 case 'd': case 'u': case 'i': case 'x':
669 (void) va_arg(count, int);
670 /* 20 bytes is enough to hold a 64-bit
671 integer. Decimal takes the most space.
672 This isn't enough for octal.
673 If a width is specified we need more
674 (which we allocate later). */
675 if (width < 20)
676 width = 20;
677 n += width;
678 if (abuffersize < width)
679 abuffersize = width;
680 break;
681 case 's':
682 {
683 /* UTF-8 */
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000684 unsigned char *s = va_arg(count, unsigned char*);
685 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
686 if (!str)
687 goto fail;
688 n += PyUnicode_GET_SIZE(str);
689 /* Remember the str and switch to the next slot */
690 *callresult++ = str;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000691 break;
692 }
693 case 'U':
694 {
695 PyObject *obj = va_arg(count, PyObject *);
696 assert(obj && PyUnicode_Check(obj));
697 n += PyUnicode_GET_SIZE(obj);
698 break;
699 }
700 case 'V':
701 {
702 PyObject *obj = va_arg(count, PyObject *);
703 const char *str = va_arg(count, const char *);
704 assert(obj || str);
705 assert(!obj || PyUnicode_Check(obj));
706 if (obj)
707 n += PyUnicode_GET_SIZE(obj);
708 else
709 n += strlen(str);
710 break;
711 }
712 case 'S':
713 {
714 PyObject *obj = va_arg(count, PyObject *);
715 PyObject *str;
716 assert(obj);
717 str = PyObject_Str(obj);
718 if (!str)
719 goto fail;
720 n += PyUnicode_GET_SIZE(str);
721 /* Remember the str and switch to the next slot */
722 *callresult++ = str;
723 break;
724 }
725 case 'R':
726 {
727 PyObject *obj = va_arg(count, PyObject *);
728 PyObject *repr;
729 assert(obj);
730 repr = PyObject_Repr(obj);
731 if (!repr)
732 goto fail;
733 n += PyUnicode_GET_SIZE(repr);
734 /* Remember the repr and switch to the next slot */
735 *callresult++ = repr;
736 break;
737 }
738 case 'p':
739 (void) va_arg(count, int);
740 /* maximum 64-bit pointer representation:
741 * 0xffffffffffffffff
742 * so 19 characters is enough.
743 * XXX I count 18 -- what's the extra for?
744 */
745 n += 19;
746 break;
747 default:
748 /* if we stumble upon an unknown
749 formatting code, copy the rest of
750 the format string to the output
751 string. (we cannot just skip the
752 code, since there's no way to know
753 what's in the argument list) */
754 n += strlen(p);
755 goto expand;
756 }
757 } else
758 n++;
759 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000760 expand:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000761 if (abuffersize > 20) {
762 abuffer = PyObject_Malloc(abuffersize);
763 if (!abuffer) {
764 PyErr_NoMemory();
765 goto fail;
766 }
767 realbuffer = abuffer;
768 }
769 else
770 realbuffer = buffer;
771 /* step 4: fill the buffer */
772 /* Since we've analyzed how much space we need for the worst case,
773 we don't have to resize the string.
774 There can be no errors beyond this point. */
775 string = PyUnicode_FromUnicode(NULL, n);
776 if (!string)
777 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000778
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000779 s = PyUnicode_AS_UNICODE(string);
780 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000781
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000782 for (f = format; *f; f++) {
783 if (*f == '%') {
784 const char* p = f++;
785 int longflag = 0;
786 int size_tflag = 0;
787 zeropad = (*f == '0');
788 /* parse the width.precision part */
789 width = 0;
790 while (isdigit((unsigned)*f))
791 width = (width*10) + *f++ - '0';
792 precision = 0;
793 if (*f == '.') {
794 f++;
795 while (isdigit((unsigned)*f))
796 precision = (precision*10) + *f++ - '0';
797 }
798 /* handle the long flag, but only for %ld and %lu.
799 others can be added when necessary. */
800 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
801 longflag = 1;
802 ++f;
803 }
804 /* handle the size_t flag. */
805 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
806 size_tflag = 1;
807 ++f;
808 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000809
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000810 switch (*f) {
811 case 'c':
812 *s++ = va_arg(vargs, int);
813 break;
814 case 'd':
815 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
816 if (longflag)
817 sprintf(realbuffer, fmt, va_arg(vargs, long));
818 else if (size_tflag)
819 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
820 else
821 sprintf(realbuffer, fmt, va_arg(vargs, int));
822 appendstring(realbuffer);
823 break;
824 case 'u':
825 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
826 if (longflag)
827 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
828 else if (size_tflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
830 else
831 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
832 appendstring(realbuffer);
833 break;
834 case 'i':
835 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
836 sprintf(realbuffer, fmt, va_arg(vargs, int));
837 appendstring(realbuffer);
838 break;
839 case 'x':
840 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
841 sprintf(realbuffer, fmt, va_arg(vargs, int));
842 appendstring(realbuffer);
843 break;
844 case 's':
845 {
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000846 /* unused, since we already have the result */
847 (void) va_arg(vargs, char *);
848 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
849 PyUnicode_GET_SIZE(*callresult));
850 s += PyUnicode_GET_SIZE(*callresult);
851 /* We're done with the unicode()/repr() => forget it */
852 Py_DECREF(*callresult);
853 /* switch to next unicode()/repr() result */
854 ++callresult;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000855 break;
856 }
857 case 'U':
858 {
859 PyObject *obj = va_arg(vargs, PyObject *);
860 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
861 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
862 s += size;
863 break;
864 }
865 case 'V':
866 {
867 PyObject *obj = va_arg(vargs, PyObject *);
868 const char *str = va_arg(vargs, const char *);
869 if (obj) {
870 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
871 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
872 s += size;
873 } else {
874 appendstring(str);
875 }
876 break;
877 }
878 case 'S':
879 case 'R':
880 {
881 Py_UNICODE *ucopy;
882 Py_ssize_t usize;
883 Py_ssize_t upos;
884 /* unused, since we already have the result */
885 (void) va_arg(vargs, PyObject *);
886 ucopy = PyUnicode_AS_UNICODE(*callresult);
887 usize = PyUnicode_GET_SIZE(*callresult);
888 for (upos = 0; upos<usize;)
889 *s++ = ucopy[upos++];
890 /* We're done with the unicode()/repr() => forget it */
891 Py_DECREF(*callresult);
892 /* switch to next unicode()/repr() result */
893 ++callresult;
894 break;
895 }
896 case 'p':
897 sprintf(buffer, "%p", va_arg(vargs, void*));
898 /* %p is ill-defined: ensure leading 0x. */
899 if (buffer[1] == 'X')
900 buffer[1] = 'x';
901 else if (buffer[1] != 'x') {
902 memmove(buffer+2, buffer, strlen(buffer)+1);
903 buffer[0] = '0';
904 buffer[1] = 'x';
905 }
906 appendstring(buffer);
907 break;
908 case '%':
909 *s++ = '%';
910 break;
911 default:
912 appendstring(p);
913 goto end;
914 }
915 } else
916 *s++ = *f;
917 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000918
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000919 end:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000920 if (callresults)
921 PyObject_Free(callresults);
922 if (abuffer)
923 PyObject_Free(abuffer);
924 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
925 return string;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000926 fail:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000927 if (callresults) {
928 PyObject **callresult2 = callresults;
929 while (callresult2 < callresult) {
930 Py_DECREF(*callresult2);
931 ++callresult2;
932 }
933 PyObject_Free(callresults);
934 }
935 if (abuffer)
936 PyObject_Free(abuffer);
937 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000938}
939
940#undef appendstring
941
942PyObject *
943PyUnicode_FromFormat(const char *format, ...)
944{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000945 PyObject* ret;
946 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000947
948#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000949 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000950#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000951 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000952#endif
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000953 ret = PyUnicode_FromFormatV(format, vargs);
954 va_end(vargs);
955 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000956}
957
Martin v. Löwis18e16552006-02-15 17:27:45 +0000958Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000959 wchar_t *w,
960 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000961{
962 if (unicode == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000963 PyErr_BadInternalCall();
964 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000966
967 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000968 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000969 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000970
Guido van Rossumd57fd912000-03-10 22:53:23 +0000971#ifdef HAVE_USABLE_WCHAR_T
972 memcpy(w, unicode->str, size * sizeof(wchar_t));
973#else
974 {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000975 register Py_UNICODE *u;
976 register Py_ssize_t i;
977 u = PyUnicode_AS_UNICODE(unicode);
978 for (i = size; i > 0; i--)
979 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000980 }
981#endif
982
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000983 if (size > PyUnicode_GET_SIZE(unicode))
984 return PyUnicode_GET_SIZE(unicode);
985 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000986 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987}
988
989#endif
990
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000991PyObject *PyUnicode_FromOrdinal(int ordinal)
992{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000993 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000994
995#ifdef Py_UNICODE_WIDE
996 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000997 PyErr_SetString(PyExc_ValueError,
998 "unichr() arg not in range(0x110000) "
999 "(wide Python build)");
1000 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001001 }
1002#else
1003 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001004 PyErr_SetString(PyExc_ValueError,
1005 "unichr() arg not in range(0x10000) "
1006 "(narrow Python build)");
1007 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001008 }
1009#endif
1010
Hye-Shik Chang40574832004-04-06 07:24:51 +00001011 s[0] = (Py_UNICODE)ordinal;
1012 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001013}
1014
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015PyObject *PyUnicode_FromObject(register PyObject *obj)
1016{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001017 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001018 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001019 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001020 Py_INCREF(obj);
1021 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001022 }
1023 if (PyUnicode_Check(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001024 /* For a Unicode subtype that's not a Unicode object,
1025 return a true Unicode object with the same data. */
1026 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1027 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001028 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001029 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1030}
1031
1032PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001033 const char *encoding,
1034 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001035{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001036 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001037 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001038 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001039
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 if (obj == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001041 PyErr_BadInternalCall();
1042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001044
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001045#if 0
1046 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001047 that no encodings is given and then redirect to
1048 PyObject_Unicode() which then applies the additional logic for
1049 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001050
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001051 NOTE: This API should really only be used for object which
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001052 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001053
1054 */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001055 if (PyUnicode_Check(obj)) {
1056 if (encoding) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001057 PyErr_SetString(PyExc_TypeError,
1058 "decoding Unicode is not supported");
1059 return NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001060 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001061 return PyObject_Unicode(obj);
1062 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063#else
1064 if (PyUnicode_Check(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001065 PyErr_SetString(PyExc_TypeError,
1066 "decoding Unicode is not supported");
1067 return NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001068 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001069#endif
1070
1071 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001072 if (PyString_Check(obj)) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001073 s = PyString_AS_STRING(obj);
1074 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001075 }
Christian Heimes3497f942008-05-26 12:29:14 +00001076 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001077 /* Python 2.x specific */
1078 PyErr_Format(PyExc_TypeError,
1079 "decoding bytearray is not supported");
1080 return NULL;
1081 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001082 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001083 /* Overwrite the error message with something more useful in
1084 case of a TypeError. */
1085 if (PyErr_ExceptionMatches(PyExc_TypeError))
1086 PyErr_Format(PyExc_TypeError,
1087 "coercing to Unicode: need string or buffer, "
1088 "%.80s found",
1089 Py_TYPE(obj)->tp_name);
1090 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001091 }
Tim Petersced69f82003-09-16 20:30:58 +00001092
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001093 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094 if (len == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001095 Py_INCREF(unicode_empty);
1096 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 }
Tim Petersced69f82003-09-16 20:30:58 +00001098 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001099 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001100
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001101 return v;
1102
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001103 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001104 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105}
1106
1107PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001108 Py_ssize_t size,
1109 const char *encoding,
1110 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111{
1112 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001113
1114 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001115 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001116
1117 /* Shortcuts for common default encodings */
1118 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001120 else if (strcmp(encoding, "latin-1") == 0)
1121 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001122#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1123 else if (strcmp(encoding, "mbcs") == 0)
1124 return PyUnicode_DecodeMBCS(s, size, errors);
1125#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001126 else if (strcmp(encoding, "ascii") == 0)
1127 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128
1129 /* Decode via the codec registry */
1130 buffer = PyBuffer_FromMemory((void *)s, size);
1131 if (buffer == NULL)
1132 goto onError;
1133 unicode = PyCodec_Decode(buffer, encoding, errors);
1134 if (unicode == NULL)
1135 goto onError;
1136 if (!PyUnicode_Check(unicode)) {
1137 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001138 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001139 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 Py_DECREF(unicode);
1141 goto onError;
1142 }
1143 Py_DECREF(buffer);
1144 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001145
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001146 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147 Py_XDECREF(buffer);
1148 return NULL;
1149}
1150
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001151PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1152 const char *encoding,
1153 const char *errors)
1154{
1155 PyObject *v;
1156
1157 if (!PyUnicode_Check(unicode)) {
1158 PyErr_BadArgument();
1159 goto onError;
1160 }
1161
1162 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001163 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001164
1165 /* Decode via the codec registry */
1166 v = PyCodec_Decode(unicode, encoding, errors);
1167 if (v == NULL)
1168 goto onError;
1169 return v;
1170
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001171 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001172 return NULL;
1173}
1174
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001176 Py_ssize_t size,
1177 const char *encoding,
1178 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179{
1180 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001181
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 unicode = PyUnicode_FromUnicode(s, size);
1183 if (unicode == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1186 Py_DECREF(unicode);
1187 return v;
1188}
1189
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001190PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1191 const char *encoding,
1192 const char *errors)
1193{
1194 PyObject *v;
1195
1196 if (!PyUnicode_Check(unicode)) {
1197 PyErr_BadArgument();
1198 goto onError;
1199 }
1200
1201 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001202 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001203
1204 /* Encode via the codec registry */
1205 v = PyCodec_Encode(unicode, encoding, errors);
1206 if (v == NULL)
1207 goto onError;
1208 return v;
1209
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001210 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001211 return NULL;
1212}
1213
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1215 const char *encoding,
1216 const char *errors)
1217{
1218 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001219
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 if (!PyUnicode_Check(unicode)) {
1221 PyErr_BadArgument();
1222 goto onError;
1223 }
Fred Drakee4315f52000-05-09 19:53:39 +00001224
Tim Petersced69f82003-09-16 20:30:58 +00001225 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001226 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001227
1228 /* Shortcuts for common default encodings */
1229 if (errors == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001230 if (strcmp(encoding, "utf-8") == 0)
1231 return PyUnicode_AsUTF8String(unicode);
1232 else if (strcmp(encoding, "latin-1") == 0)
1233 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001234#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001235 else if (strcmp(encoding, "mbcs") == 0)
1236 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001237#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001238 else if (strcmp(encoding, "ascii") == 0)
1239 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241
1242 /* Encode via the codec registry */
1243 v = PyCodec_Encode(unicode, encoding, errors);
1244 if (v == NULL)
1245 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001246 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001248 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001249 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 Py_DECREF(v);
1251 goto onError;
1252 }
1253 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001255 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 return NULL;
1257}
1258
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001259PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001260 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001261{
1262 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1263
1264 if (v)
1265 return v;
1266 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1267 if (v && errors == NULL)
1268 ((PyUnicodeObject *)unicode)->defenc = v;
1269 return v;
1270}
1271
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1273{
1274 if (!PyUnicode_Check(unicode)) {
1275 PyErr_BadArgument();
1276 goto onError;
1277 }
1278 return PyUnicode_AS_UNICODE(unicode);
1279
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001280 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 return NULL;
1282}
1283
Martin v. Löwis18e16552006-02-15 17:27:45 +00001284Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285{
1286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 goto onError;
1289 }
1290 return PyUnicode_GET_SIZE(unicode);
1291
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001292 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 return -1;
1294}
1295
Thomas Wouters78890102000-07-22 19:25:51 +00001296const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001297{
1298 return unicode_default_encoding;
1299}
1300
1301int PyUnicode_SetDefaultEncoding(const char *encoding)
1302{
1303 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001304
Fred Drakee4315f52000-05-09 19:53:39 +00001305 /* Make sure the encoding is valid. As side effect, this also
1306 loads the encoding into the codec registry cache. */
1307 v = _PyCodec_Lookup(encoding);
1308 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001309 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001310 Py_DECREF(v);
1311 strncpy(unicode_default_encoding,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001312 encoding,
1313 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001314 return 0;
1315
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001316 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001317 return -1;
1318}
1319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001320/* error handling callback helper:
1321 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001322 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001323 and adjust various state variables.
1324 return 0 on success, -1 on error
1325*/
1326
1327static
1328int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001329 const char *encoding, const char *reason,
1330 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1331 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1332 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001334 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335
1336 PyObject *restuple = NULL;
1337 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001338 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1339 Py_ssize_t requiredsize;
1340 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001341 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001342 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001343 int res = -1;
1344
1345 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001346 *errorHandler = PyCodec_LookupError(errors);
1347 if (*errorHandler == NULL)
1348 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349 }
1350
1351 if (*exceptionObject == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001352 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001353 encoding, input, insize, *startinpos, *endinpos, reason);
1354 if (*exceptionObject == NULL)
1355 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001356 }
1357 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001358 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1359 goto onError;
1360 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1361 goto onError;
1362 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1363 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001364 }
1365
1366 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1367 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001368 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001369 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00001370 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001371 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001372 }
1373 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001374 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 if (newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001376 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001377 if (newpos<0 || newpos>insize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001378 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1379 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001380 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001381
1382 /* need more space? (at least enough for what we
1383 have+the replacement+the rest of the string (starting
1384 at the new input position), so we won't have to check space
1385 when there are no errors in the rest of the string) */
1386 repptr = PyUnicode_AS_UNICODE(repunicode);
1387 repsize = PyUnicode_GET_SIZE(repunicode);
1388 requiredsize = *outpos + repsize + insize-newpos;
1389 if (requiredsize > outsize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001390 if (requiredsize<2*outsize)
1391 requiredsize = 2*outsize;
1392 if (_PyUnicode_Resize(output, requiredsize) < 0)
1393 goto onError;
1394 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395 }
1396 *endinpos = newpos;
1397 *inptr = input + newpos;
1398 Py_UNICODE_COPY(*outptr, repptr, repsize);
1399 *outptr += repsize;
1400 *outpos += repsize;
1401 /* we made it! */
1402 res = 0;
1403
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001404 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001405 Py_XDECREF(restuple);
1406 return res;
1407}
1408
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001409/* --- UTF-7 Codec -------------------------------------------------------- */
1410
1411/* see RFC2152 for details */
1412
/* Classification of the 128 ASCII code points for the UTF-7 encoder,
   i.e. whether a character is special (cannot be directly encoded):
     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional)
   Each row below covers 16 consecutive code points. */
static char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* SPECIAL(c, encodeO, encodeWS) is true when `c` has to be encoded:
   it lies outside 1..127, is classified 1 in the table, is whitespace
   (class 2) while encodeWS is set, or is in Set O (class 3) while
   encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) ||          \
     (encodeO && (utf7_special[(c)] == 3)))
1441
/* Base64 helpers for the UTF-7 codec.

   B64(n)     - base64 alphabet character for the low 6 bits of n.
   B64CHAR(c) - true if c is alphanumeric, '+' or '/'.
   UB64(c)    - inverse of B64(): the 6-bit value of alphabet character
                c ('A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' ->
                52..61, '+' -> 62, '/' -> 63). */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" \
     [(n) & 0x3f])

#define B64CHAR(c)                                      \
    (isalnum(c) || (c) == '+' || (c) == '/')

#define UB64(c)                                         \
    ((c) == '+' ? 62 :                                  \
     (c) == '/' ? 63 :                                  \
     (c) >= 'a' ? (c) - 71 :                            \
     (c) >= 'A' ? (c) - 65 :                            \
     (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in `ch`,
   emit the topmost 6 pending bits as one base64 character through
   `out` and reduce `bits` by 6.  Expands to a bare while statement and
   modifies `out` and `bits` in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }
1455
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in `ch`, extract the topmost 16 pending bits as one Py_UNICODE and
   reduce `bits` by 16.

   - If `surrogate` is set, the unit is dropped and the flag cleared: an
     error has already been generated for the high surrogate, so the low
     surrogate is not checked.
   - A unit in 0xDC00..0xDFFF is treated as part of a surrogate pair,
     which cannot be represented in a 16-bit character: `surrogate` is
     set, `errmsg` is assigned and control jumps to the `utf7Error`
     label.  Both `errmsg` and `utf7Error` must exist at the expansion
     site.
   - Any other unit is stored through `out`.

   Expands to a bare while statement and modifies `out`, `bits` and
   `surrogate` in place. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            surrogate = 0;                                              \
        }                                                               \
        else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {                  \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        }                                                               \
        else {                                                          \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001474
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001475PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001476 Py_ssize_t size,
1477 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001478{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001479 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1480}
1481
1482PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001483 Py_ssize_t size,
1484 const char *errors,
1485 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001486{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001487 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001488 Py_ssize_t startinpos;
1489 Py_ssize_t endinpos;
1490 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001491 const char *e;
1492 PyUnicodeObject *unicode;
1493 Py_UNICODE *p;
1494 const char *errmsg = "";
1495 int inShift = 0;
1496 unsigned int bitsleft = 0;
1497 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 int surrogate = 0;
1499 PyObject *errorHandler = NULL;
1500 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001501
1502 unicode = _PyUnicode_New(size);
1503 if (!unicode)
1504 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001505 if (size == 0) {
1506 if (consumed)
1507 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001508 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001509 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001510
1511 p = unicode->str;
1512 e = s + size;
1513
1514 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001515 Py_UNICODE ch;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001516 restart:
Antoine Pitrou4982d5d2008-07-25 17:45:59 +00001517 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001518
1519 if (inShift) {
1520 if ((ch == '-') || !B64CHAR(ch)) {
1521 inShift = 0;
1522 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001523
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001524 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1525 if (bitsleft >= 6) {
1526 /* The shift sequence has a partial character in it. If
1527 bitsleft < 6 then we could just classify it as padding
1528 but that is not the case here */
1529
1530 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001531 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001532 }
1533 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001534 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535 here so indicate the potential of a misencoded character. */
1536
1537 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1538 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1539 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001540 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001541 }
1542
1543 if (ch == '-') {
1544 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001545 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546 inShift = 1;
1547 }
1548 } else if (SPECIAL(ch,0,0)) {
1549 errmsg = "unexpected special character";
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001550 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551 } else {
1552 *p++ = ch;
1553 }
1554 } else {
1555 charsleft = (charsleft << 6) | UB64(ch);
1556 bitsleft += 6;
1557 s++;
1558 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1559 }
1560 }
1561 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001562 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 s++;
1564 if (s < e && *s == '-') {
1565 s++;
1566 *p++ = '+';
1567 } else
1568 {
1569 inShift = 1;
1570 bitsleft = 0;
1571 }
1572 }
1573 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001574 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001575 errmsg = "unexpected special character";
1576 s++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001577 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578 }
1579 else {
1580 *p++ = ch;
1581 s++;
1582 }
1583 continue;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001584 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001585 outpos = p-PyUnicode_AS_UNICODE(unicode);
1586 endinpos = s-starts;
1587 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001588 errors, &errorHandler,
1589 "utf7", errmsg,
1590 starts, size, &startinpos, &endinpos, &exc, &s,
1591 &unicode, &outpos, &p))
1592 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 }
1594
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001595 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 outpos = p-PyUnicode_AS_UNICODE(unicode);
1597 endinpos = size;
1598 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001599 errors, &errorHandler,
1600 "utf7", "unterminated shift sequence",
1601 starts, size, &startinpos, &endinpos, &exc, &s,
1602 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 if (s < e)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001605 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001607 if (consumed) {
1608 if(inShift)
1609 *consumed = startinpos;
1610 else
1611 *consumed = s-starts;
1612 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001613
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001614 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001615 goto onError;
1616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 Py_XDECREF(errorHandler);
1618 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001619 return (PyObject *)unicode;
1620
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001621 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001622 Py_XDECREF(errorHandler);
1623 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624 Py_DECREF(unicode);
1625 return NULL;
1626}
1627
1628
1629PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001630 Py_ssize_t size,
1631 int encodeSetO,
1632 int encodeWhiteSpace,
1633 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001634{
1635 PyObject *v;
1636 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001637 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001639 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001640 unsigned int bitsleft = 0;
1641 unsigned long charsleft = 0;
1642 char * out;
1643 char * start;
1644
Neal Norwitze7d8be82008-07-31 17:17:14 +00001645 if (cbAllocated / 5 != size)
1646 return PyErr_NoMemory();
1647
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648 if (size == 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001649 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001651 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 if (v == NULL)
1653 return NULL;
1654
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001655 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001656 for (;i < size; ++i) {
1657 Py_UNICODE ch = s[i];
1658
1659 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001660 if (ch == '+') {
1661 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 *out++ = '-';
1663 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1664 charsleft = ch;
1665 bitsleft = 16;
1666 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001667 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001669 } else {
1670 *out++ = (char) ch;
1671 }
1672 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1674 *out++ = B64(charsleft << (6-bitsleft));
1675 charsleft = 0;
1676 bitsleft = 0;
1677 /* Characters not in the BASE64 set implicitly unshift the sequence
1678 so no '-' is required, except if the character is itself a '-' */
1679 if (B64CHAR(ch) || ch == '-') {
1680 *out++ = '-';
1681 }
1682 inShift = 0;
1683 *out++ = (char) ch;
1684 } else {
1685 bitsleft += 16;
1686 charsleft = (charsleft << 16) | ch;
1687 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1688
Jesus Cea585ad8a2009-07-02 15:37:21 +00001689 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001690 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 or '-' then the shift sequence will be terminated implicitly and we
1692 don't have to insert a '-'. */
1693
1694 if (bitsleft == 0) {
1695 if (i + 1 < size) {
1696 Py_UNICODE ch2 = s[i+1];
1697
1698 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001699
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001700 } else if (B64CHAR(ch2) || ch2 == '-') {
1701 *out++ = '-';
1702 inShift = 0;
1703 } else {
1704 inShift = 0;
1705 }
1706
1707 }
1708 else {
1709 *out++ = '-';
1710 inShift = 0;
1711 }
1712 }
Tim Petersced69f82003-09-16 20:30:58 +00001713 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001715 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716 if (bitsleft) {
1717 *out++= B64(charsleft << (6-bitsleft) );
1718 *out++ = '-';
1719 }
1720
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001721 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001722 return v;
1723}
1724
/* The helper macros above are only meant for the UTF-7 codec; drop them so
   they cannot leak into the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1731
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732/* --- UTF-8 Codec -------------------------------------------------------- */
1733
/* Length in bytes of the UTF-8 sequence introduced by each possible first
   byte.  Zero means the byte is not a legal prefix.  See RFC 3629 for
   details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: never valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-FF */
};
1755
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001757 Py_ssize_t size,
1758 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759{
Walter Dörwald69652032004-09-07 20:24:22 +00001760 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1761}
1762
1763PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001764 Py_ssize_t size,
1765 const char *errors,
1766 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001767{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 int n;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001770 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001771 Py_ssize_t startinpos;
1772 Py_ssize_t endinpos;
1773 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 const char *e;
1775 PyUnicodeObject *unicode;
1776 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001777 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 PyObject *errorHandler = NULL;
1779 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780
1781 /* Note: size will always be longer than the resulting Unicode
1782 character count */
1783 unicode = _PyUnicode_New(size);
1784 if (!unicode)
1785 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001786 if (size == 0) {
1787 if (consumed)
1788 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791
1792 /* Unpack UTF-8 encoded data */
1793 p = unicode->str;
1794 e = s + size;
1795
1796 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001797 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798
1799 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001800 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 s++;
1802 continue;
1803 }
1804
1805 n = utf8_code_length[ch];
1806
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001807 if (s + n > e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001808 if (consumed)
1809 break;
1810 else {
1811 errmsg = "unexpected end of data";
1812 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001813 endinpos = startinpos+1;
1814 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1815 endinpos++;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001816 goto utf8Error;
1817 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819
1820 switch (n) {
1821
1822 case 0:
Ezio Melotti86e5e172010-07-03 05:34:39 +00001823 errmsg = "invalid start byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001824 startinpos = s-starts;
1825 endinpos = startinpos+1;
1826 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827
1828 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001829 errmsg = "internal error";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001830 startinpos = s-starts;
1831 endinpos = startinpos+1;
1832 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833
1834 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001835 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti86e5e172010-07-03 05:34:39 +00001836 errmsg = "invalid continuation byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001837 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001838 endinpos = startinpos + 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001839 goto utf8Error;
1840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti86e5e172010-07-03 05:34:39 +00001842 assert ((ch > 0x007F) && (ch <= 0x07FF));
1843 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 break;
1845
1846 case 3:
Ezio Melotti86e5e172010-07-03 05:34:39 +00001847 /* XXX: surrogates shouldn't be valid UTF-8!
1848 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1849 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1850 Uncomment the 2 lines below to make them invalid,
1851 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001852 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti86e5e172010-07-03 05:34:39 +00001853 (s[2] & 0xc0) != 0x80 ||
1854 ((unsigned char)s[0] == 0xE0 &&
1855 (unsigned char)s[1] < 0xA0)/* ||
1856 ((unsigned char)s[0] == 0xED &&
1857 (unsigned char)s[1] > 0x9F)*/) {
1858 errmsg = "invalid continuation byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001859 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001860 endinpos = startinpos + 1;
1861
1862 /* if s[1] first two bits are 1 and 0, then the invalid
1863 continuation byte is s[2], so increment endinpos by 1,
1864 if not, s[1] is invalid and endinpos doesn't need to
1865 be incremented. */
1866 if ((s[1] & 0xC0) == 0x80)
1867 endinpos++;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001868 goto utf8Error;
1869 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti86e5e172010-07-03 05:34:39 +00001871 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
1872 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001873 break;
1874
1875 case 4:
1876 if ((s[1] & 0xc0) != 0x80 ||
1877 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti86e5e172010-07-03 05:34:39 +00001878 (s[3] & 0xc0) != 0x80 ||
1879 ((unsigned char)s[0] == 0xF0 &&
1880 (unsigned char)s[1] < 0x90) ||
1881 ((unsigned char)s[0] == 0xF4 &&
1882 (unsigned char)s[1] > 0x8F)) {
1883 errmsg = "invalid continuation byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001884 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001885 endinpos = startinpos + 1;
1886 if ((s[1] & 0xC0) == 0x80) {
1887 endinpos++;
1888 if ((s[2] & 0xC0) == 0x80)
1889 endinpos++;
1890 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001891 goto utf8Error;
1892 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001893 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti86e5e172010-07-03 05:34:39 +00001894 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1895 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
1896
Fredrik Lundh8f455852001-06-27 18:59:43 +00001897#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001898 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001899#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001900 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001901
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001902 /* translate from 10000..10FFFF to 0..FFFF */
1903 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001904
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001905 /* high surrogate = top 10 bits added to D800 */
1906 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001907
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001908 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001909 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001910#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 }
1913 s += n;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001914 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001915
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001916 utf8Error:
1917 outpos = p-PyUnicode_AS_UNICODE(unicode);
1918 if (unicode_decode_call_errorhandler(
1919 errors, &errorHandler,
1920 "utf8", errmsg,
1921 starts, size, &startinpos, &endinpos, &exc, &s,
1922 &unicode, &outpos, &p))
1923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924 }
Walter Dörwald69652032004-09-07 20:24:22 +00001925 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001926 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927
1928 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001929 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 goto onError;
1931
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 Py_XDECREF(errorHandler);
1933 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 return (PyObject *)unicode;
1935
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001936 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001937 Py_XDECREF(errorHandler);
1938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 Py_DECREF(unicode);
1940 return NULL;
1941}
1942
Tim Peters602f7402002-04-27 18:03:26 +00001943/* Allocation strategy: if the string is short, convert into a stack buffer
1944 and allocate exactly as much space needed at the end. Else allocate the
1945 maximum possible needed (4 result bytes per Unicode character), and return
1946 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001947*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001948PyObject *
1949PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001950 Py_ssize_t size,
1951 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952{
Tim Peters602f7402002-04-27 18:03:26 +00001953#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001954
Martin v. Löwis18e16552006-02-15 17:27:45 +00001955 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001956 PyObject *v; /* result string object */
1957 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001958 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001959 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001960 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001961
Tim Peters602f7402002-04-27 18:03:26 +00001962 assert(s != NULL);
1963 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964
Tim Peters602f7402002-04-27 18:03:26 +00001965 if (size <= MAX_SHORT_UNICHARS) {
1966 /* Write into the stack buffer; nallocated can't overflow.
1967 * At the end, we'll allocate exactly as much heap space as it
1968 * turns out we need.
1969 */
1970 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1971 v = NULL; /* will allocate after we're done */
1972 p = stackbuf;
1973 }
1974 else {
1975 /* Overallocate on the heap, and give the excess back at the end. */
1976 nallocated = size * 4;
1977 if (nallocated / 4 != size) /* overflow! */
1978 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001979 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001980 if (v == NULL)
1981 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001982 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001983 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001984
Tim Peters602f7402002-04-27 18:03:26 +00001985 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001986 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001987
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001988 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001989 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001991
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001993 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001994 *p++ = (char)(0xc0 | (ch >> 6));
1995 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001996 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001997 else {
Tim Peters602f7402002-04-27 18:03:26 +00001998 /* Encode UCS2 Unicode ordinals */
1999 if (ch < 0x10000) {
2000 /* Special case: check for high surrogate */
2001 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2002 Py_UCS4 ch2 = s[i];
2003 /* Check for low surrogate and combine the two to
2004 form a UCS4 value */
2005 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002006 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002007 i++;
2008 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002009 }
Tim Peters602f7402002-04-27 18:03:26 +00002010 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002011 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002012 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002013 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2014 *p++ = (char)(0x80 | (ch & 0x3f));
2015 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002016 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002017 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002018 /* Encode UCS4 Unicode ordinals */
2019 *p++ = (char)(0xf0 | (ch >> 18));
2020 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2021 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2022 *p++ = (char)(0x80 | (ch & 0x3f));
2023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002025
Tim Peters602f7402002-04-27 18:03:26 +00002026 if (v == NULL) {
2027 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002028 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002029 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002030 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002031 }
2032 else {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002033 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002034 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002035 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002036 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002039
Tim Peters602f7402002-04-27 18:03:26 +00002040#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041}
2042
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2044{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 if (!PyUnicode_Check(unicode)) {
2046 PyErr_BadArgument();
2047 return NULL;
2048 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002049 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002050 PyUnicode_GET_SIZE(unicode),
2051 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052}
2053
Walter Dörwald6e390802007-08-17 16:41:28 +00002054/* --- UTF-32 Codec ------------------------------------------------------- */
2055
2056PyObject *
2057PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002058 Py_ssize_t size,
2059 const char *errors,
2060 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002061{
2062 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2063}
2064
2065PyObject *
2066PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002067 Py_ssize_t size,
2068 const char *errors,
2069 int *byteorder,
2070 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002071{
2072 const char *starts = s;
2073 Py_ssize_t startinpos;
2074 Py_ssize_t endinpos;
2075 Py_ssize_t outpos;
2076 PyUnicodeObject *unicode;
2077 Py_UNICODE *p;
2078#ifndef Py_UNICODE_WIDE
Antoine Pitrou4595e512010-06-11 21:48:02 +00002079 int pairs = 0;
Walter Dörwald6e390802007-08-17 16:41:28 +00002080#else
2081 const int pairs = 0;
2082#endif
Antoine Pitrou4595e512010-06-11 21:48:02 +00002083 const unsigned char *q, *e, *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002084 int bo = 0; /* assume native ordering by default */
2085 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002086 /* Offsets from q for retrieving bytes in the right order. */
2087#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2088 int iorder[] = {0, 1, 2, 3};
2089#else
2090 int iorder[] = {3, 2, 1, 0};
2091#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002092 PyObject *errorHandler = NULL;
2093 PyObject *exc = NULL;
Antoine Pitrou4595e512010-06-11 21:48:02 +00002094
Walter Dörwald6e390802007-08-17 16:41:28 +00002095 q = (unsigned char *)s;
2096 e = q + size;
2097
2098 if (byteorder)
2099 bo = *byteorder;
2100
2101 /* Check for BOM marks (U+FEFF) in the input and adjust current
2102 byte order setting accordingly. In native mode, the leading BOM
2103 mark is skipped, in all other modes, it is copied to the output
2104 stream as-is (giving a ZWNBSP character). */
2105 if (bo == 0) {
2106 if (size >= 4) {
2107 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002108 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002109#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002110 if (bom == 0x0000FEFF) {
2111 q += 4;
2112 bo = -1;
2113 }
2114 else if (bom == 0xFFFE0000) {
2115 q += 4;
2116 bo = 1;
2117 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002118#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002119 if (bom == 0x0000FEFF) {
2120 q += 4;
2121 bo = 1;
2122 }
2123 else if (bom == 0xFFFE0000) {
2124 q += 4;
2125 bo = -1;
2126 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002127#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002128 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002129 }
2130
2131 if (bo == -1) {
2132 /* force LE */
2133 iorder[0] = 0;
2134 iorder[1] = 1;
2135 iorder[2] = 2;
2136 iorder[3] = 3;
2137 }
2138 else if (bo == 1) {
2139 /* force BE */
2140 iorder[0] = 3;
2141 iorder[1] = 2;
2142 iorder[2] = 1;
2143 iorder[3] = 0;
2144 }
2145
Antoine Pitrou4595e512010-06-11 21:48:02 +00002146 /* On narrow builds we split characters outside the BMP into two
2147 codepoints => count how much extra space we need. */
2148#ifndef Py_UNICODE_WIDE
2149 for (qq = q; qq < e; qq += 4)
2150 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2151 pairs++;
2152#endif
2153
2154 /* This might be one to much, because of a BOM */
2155 unicode = _PyUnicode_New((size+3)/4+pairs);
2156 if (!unicode)
2157 return NULL;
2158 if (size == 0)
2159 return (PyObject *)unicode;
2160
2161 /* Unpack UTF-32 encoded data */
2162 p = unicode->str;
2163
Walter Dörwald6e390802007-08-17 16:41:28 +00002164 while (q < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002165 Py_UCS4 ch;
2166 /* remaining bytes at the end? (size should be divisible by 4) */
2167 if (e-q<4) {
2168 if (consumed)
2169 break;
2170 errmsg = "truncated data";
2171 startinpos = ((const char *)q)-starts;
2172 endinpos = ((const char *)e)-starts;
2173 goto utf32Error;
2174 /* The remaining input chars are ignored if the callback
2175 chooses to skip the input */
2176 }
2177 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2178 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002179
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002180 if (ch >= 0x110000)
2181 {
2182 errmsg = "codepoint not in range(0x110000)";
2183 startinpos = ((const char *)q)-starts;
2184 endinpos = startinpos+4;
2185 goto utf32Error;
2186 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002187#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002188 if (ch >= 0x10000)
2189 {
2190 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2191 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2192 }
2193 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002194#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002195 *p++ = ch;
2196 q += 4;
2197 continue;
2198 utf32Error:
2199 outpos = p-PyUnicode_AS_UNICODE(unicode);
2200 if (unicode_decode_call_errorhandler(
2201 errors, &errorHandler,
2202 "utf32", errmsg,
Georg Brandlf7a09be2009-09-17 11:33:31 +00002203 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002204 &unicode, &outpos, &p))
2205 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002206 }
2207
2208 if (byteorder)
2209 *byteorder = bo;
2210
2211 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002212 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002213
2214 /* Adjust length */
2215 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2216 goto onError;
2217
2218 Py_XDECREF(errorHandler);
2219 Py_XDECREF(exc);
2220 return (PyObject *)unicode;
2221
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002222 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002223 Py_DECREF(unicode);
2224 Py_XDECREF(errorHandler);
2225 Py_XDECREF(exc);
2226 return NULL;
2227}
2228
2229PyObject *
2230PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002231 Py_ssize_t size,
2232 const char *errors,
2233 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002234{
2235 PyObject *v;
2236 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002237 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002238#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002239 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002240#else
2241 const int pairs = 0;
2242#endif
2243 /* Offsets from p for storing byte pairs in the right order. */
2244#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2245 int iorder[] = {0, 1, 2, 3};
2246#else
2247 int iorder[] = {3, 2, 1, 0};
2248#endif
2249
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002250#define STORECHAR(CH) \
2251 do { \
2252 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2253 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2254 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2255 p[iorder[0]] = (CH) & 0xff; \
2256 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002257 } while(0)
2258
2259 /* In narrow builds we can output surrogate pairs as one codepoint,
2260 so we need less space. */
2261#ifndef Py_UNICODE_WIDE
2262 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002263 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2264 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2265 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002266#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002267 nsize = (size - pairs + (byteorder == 0));
2268 bytesize = nsize * 4;
2269 if (bytesize / 4 != nsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002270 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002271 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002272 if (v == NULL)
2273 return NULL;
2274
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002275 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002276 if (byteorder == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002277 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002278 if (size == 0)
2279 return v;
2280
2281 if (byteorder == -1) {
2282 /* force LE */
2283 iorder[0] = 0;
2284 iorder[1] = 1;
2285 iorder[2] = 2;
2286 iorder[3] = 3;
2287 }
2288 else if (byteorder == 1) {
2289 /* force BE */
2290 iorder[0] = 3;
2291 iorder[1] = 2;
2292 iorder[2] = 1;
2293 iorder[3] = 0;
2294 }
2295
2296 while (size-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002297 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002298#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002299 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2300 Py_UCS4 ch2 = *s;
2301 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2302 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2303 s++;
2304 size--;
2305 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002306 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002307#endif
2308 STORECHAR(ch);
2309 }
2310 return v;
2311#undef STORECHAR
2312}
2313
2314PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2315{
2316 if (!PyUnicode_Check(unicode)) {
2317 PyErr_BadArgument();
2318 return NULL;
2319 }
2320 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002321 PyUnicode_GET_SIZE(unicode),
2322 NULL,
2323 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002324}
2325
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326/* --- UTF-16 Codec ------------------------------------------------------- */
2327
Tim Peters772747b2001-08-09 22:21:55 +00002328PyObject *
2329PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002330 Py_ssize_t size,
2331 const char *errors,
2332 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333{
Walter Dörwald69652032004-09-07 20:24:22 +00002334 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2335}
2336
2337PyObject *
2338PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002339 Py_ssize_t size,
2340 const char *errors,
2341 int *byteorder,
2342 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002343{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002344 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002345 Py_ssize_t startinpos;
2346 Py_ssize_t endinpos;
2347 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002348 PyUnicodeObject *unicode;
2349 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002350 const unsigned char *q, *e;
2351 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002352 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002353 /* Offsets from q for retrieving byte pairs in the right order. */
2354#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2355 int ihi = 1, ilo = 0;
2356#else
2357 int ihi = 0, ilo = 1;
2358#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002359 PyObject *errorHandler = NULL;
2360 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361
2362 /* Note: size will always be longer than the resulting Unicode
2363 character count */
2364 unicode = _PyUnicode_New(size);
2365 if (!unicode)
2366 return NULL;
2367 if (size == 0)
2368 return (PyObject *)unicode;
2369
2370 /* Unpack UTF-16 encoded data */
2371 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002372 q = (unsigned char *)s;
2373 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374
2375 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002376 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002378 /* Check for BOM marks (U+FEFF) in the input and adjust current
2379 byte order setting accordingly. In native mode, the leading BOM
2380 mark is skipped, in all other modes, it is copied to the output
2381 stream as-is (giving a ZWNBSP character). */
2382 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002383 if (size >= 2) {
2384 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002385#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002386 if (bom == 0xFEFF) {
2387 q += 2;
2388 bo = -1;
2389 }
2390 else if (bom == 0xFFFE) {
2391 q += 2;
2392 bo = 1;
2393 }
Tim Petersced69f82003-09-16 20:30:58 +00002394#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002395 if (bom == 0xFEFF) {
2396 q += 2;
2397 bo = 1;
2398 }
2399 else if (bom == 0xFFFE) {
2400 q += 2;
2401 bo = -1;
2402 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002403#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002404 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406
Tim Peters772747b2001-08-09 22:21:55 +00002407 if (bo == -1) {
2408 /* force LE */
2409 ihi = 1;
2410 ilo = 0;
2411 }
2412 else if (bo == 1) {
2413 /* force BE */
2414 ihi = 0;
2415 ilo = 1;
2416 }
2417
2418 while (q < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002419 Py_UNICODE ch;
2420 /* remaining bytes at the end? (size should be even) */
2421 if (e-q<2) {
2422 if (consumed)
2423 break;
2424 errmsg = "truncated data";
2425 startinpos = ((const char *)q)-starts;
2426 endinpos = ((const char *)e)-starts;
2427 goto utf16Error;
2428 /* The remaining input chars are ignored if the callback
2429 chooses to skip the input */
2430 }
2431 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002432
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002433 q += 2;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002434
2435 if (ch < 0xD800 || ch > 0xDFFF) {
2436 *p++ = ch;
2437 continue;
2438 }
2439
2440 /* UTF-16 code pair: */
2441 if (q >= e) {
2442 errmsg = "unexpected end of data";
2443 startinpos = (((const char *)q)-2)-starts;
2444 endinpos = ((const char *)e)-starts;
2445 goto utf16Error;
2446 }
2447 if (0xD800 <= ch && ch <= 0xDBFF) {
2448 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2449 q += 2;
2450 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002451#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002452 *p++ = ch;
2453 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002454#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002455 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002456#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002457 continue;
2458 }
2459 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002460 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002461 startinpos = (((const char *)q)-4)-starts;
2462 endinpos = startinpos+2;
2463 goto utf16Error;
2464 }
2465
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002466 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002467 errmsg = "illegal encoding";
2468 startinpos = (((const char *)q)-2)-starts;
2469 endinpos = startinpos+2;
2470 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002471
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002472 utf16Error:
2473 outpos = p-PyUnicode_AS_UNICODE(unicode);
2474 if (unicode_decode_call_errorhandler(
2475 errors, &errorHandler,
2476 "utf16", errmsg,
2477 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2478 &unicode, &outpos, &p))
2479 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 }
2481
2482 if (byteorder)
2483 *byteorder = bo;
2484
Walter Dörwald69652032004-09-07 20:24:22 +00002485 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002486 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002487
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002489 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 goto onError;
2491
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002492 Py_XDECREF(errorHandler);
2493 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 return (PyObject *)unicode;
2495
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002496 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498 Py_XDECREF(errorHandler);
2499 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 return NULL;
2501}
2502
Tim Peters772747b2001-08-09 22:21:55 +00002503PyObject *
2504PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002505 Py_ssize_t size,
2506 const char *errors,
2507 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508{
2509 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002510 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002511 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002512#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002513 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002514#else
2515 const int pairs = 0;
2516#endif
Tim Peters772747b2001-08-09 22:21:55 +00002517 /* Offsets from p for storing byte pairs in the right order. */
2518#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2519 int ihi = 1, ilo = 0;
2520#else
2521 int ihi = 0, ilo = 1;
2522#endif
2523
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002524#define STORECHAR(CH) \
2525 do { \
2526 p[ihi] = ((CH) >> 8) & 0xff; \
2527 p[ilo] = (CH) & 0xff; \
2528 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002529 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002531#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002532 for (i = pairs = 0; i < size; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002533 if (s[i] >= 0x10000)
2534 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002535#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002536 /* 2 * (size + pairs + (byteorder == 0)) */
2537 if (size > PY_SSIZE_T_MAX ||
2538 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002539 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002540 nsize = size + pairs + (byteorder == 0);
2541 bytesize = nsize * 2;
2542 if (bytesize / 2 != nsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002543 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002544 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 if (v == NULL)
2546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002548 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 if (byteorder == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002550 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002551 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002552 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002553
2554 if (byteorder == -1) {
2555 /* force LE */
2556 ihi = 1;
2557 ilo = 0;
2558 }
2559 else if (byteorder == 1) {
2560 /* force BE */
2561 ihi = 0;
2562 ilo = 1;
2563 }
2564
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002565 while (size-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002566 Py_UNICODE ch = *s++;
2567 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002568#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002569 if (ch >= 0x10000) {
2570 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2571 ch = 0xD800 | ((ch-0x10000) >> 10);
2572 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002573#endif
Tim Peters772747b2001-08-09 22:21:55 +00002574 STORECHAR(ch);
2575 if (ch2)
2576 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002579#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580}
2581
2582PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2583{
2584 if (!PyUnicode_Check(unicode)) {
2585 PyErr_BadArgument();
2586 return NULL;
2587 }
2588 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002589 PyUnicode_GET_SIZE(unicode),
2590 NULL,
2591 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592}
2593
2594/* --- Unicode Escape Codec ----------------------------------------------- */
2595
Fredrik Lundh06d12682001-01-24 07:59:11 +00002596static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002597
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002599 Py_ssize_t size,
2600 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002602 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002603 Py_ssize_t startinpos;
2604 Py_ssize_t endinpos;
2605 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002610 char* message;
2611 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 PyObject *errorHandler = NULL;
2613 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002614
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 /* Escaped strings will always be longer than the resulting
2616 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002617 length after conversion to the true value.
2618 (but if the error callback returns a long replacement string
2619 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 v = _PyUnicode_New(size);
2621 if (v == NULL)
2622 goto onError;
2623 if (size == 0)
2624 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002628
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 while (s < end) {
2630 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002631 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002632 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633
2634 /* Non-escape characters are interpreted as Unicode ordinals */
2635 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002636 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637 continue;
2638 }
2639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 /* \ - Escapes */
2642 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002643 c = *s++;
2644 if (s > end)
2645 c = '\0'; /* Invalid after \ */
2646 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002648 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 case '\n': break;
2650 case '\\': *p++ = '\\'; break;
2651 case '\'': *p++ = '\''; break;
2652 case '\"': *p++ = '\"'; break;
2653 case 'b': *p++ = '\b'; break;
2654 case 'f': *p++ = '\014'; break; /* FF */
2655 case 't': *p++ = '\t'; break;
2656 case 'n': *p++ = '\n'; break;
2657 case 'r': *p++ = '\r'; break;
2658 case 'v': *p++ = '\013'; break; /* VT */
2659 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2660
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002661 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 case '0': case '1': case '2': case '3':
2663 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002664 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002665 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002666 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002667 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002668 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002670 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 break;
2672
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002673 /* hex escapes */
2674 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002676 digits = 2;
2677 message = "truncated \\xXX escape";
2678 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002680 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002682 digits = 4;
2683 message = "truncated \\uXXXX escape";
2684 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002686 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002687 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002688 digits = 8;
2689 message = "truncated \\UXXXXXXXX escape";
2690 hexescape:
2691 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002692 outpos = p-PyUnicode_AS_UNICODE(v);
2693 if (s+digits>end) {
2694 endinpos = size;
2695 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002696 errors, &errorHandler,
2697 "unicodeescape", "end of string in escape sequence",
2698 starts, size, &startinpos, &endinpos, &exc, &s,
2699 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002700 goto onError;
2701 goto nextByte;
2702 }
2703 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002704 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002705 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 endinpos = (s+i+1)-starts;
2707 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002708 errors, &errorHandler,
2709 "unicodeescape", message,
2710 starts, size, &startinpos, &endinpos, &exc, &s,
2711 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002712 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002714 }
2715 chr = (chr<<4) & ~0xF;
2716 if (c >= '0' && c <= '9')
2717 chr += c - '0';
2718 else if (c >= 'a' && c <= 'f')
2719 chr += 10 + c - 'a';
2720 else
2721 chr += 10 + c - 'A';
2722 }
2723 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002724 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 /* _decoding_error will have already written into the
2726 target buffer. */
2727 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002728 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002729 /* when we get here, chr is a 32-bit unicode character */
2730 if (chr <= 0xffff)
2731 /* UCS-2 character */
2732 *p++ = (Py_UNICODE) chr;
2733 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002734 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002735 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002736#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002737 *p++ = chr;
2738#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002739 chr -= 0x10000L;
2740 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002741 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002742#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002743 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 endinpos = s-starts;
2745 outpos = p-PyUnicode_AS_UNICODE(v);
2746 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002747 errors, &errorHandler,
2748 "unicodeescape", "illegal Unicode character",
2749 starts, size, &startinpos, &endinpos, &exc, &s,
2750 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002751 goto onError;
2752 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002753 break;
2754
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002755 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002756 case 'N':
2757 message = "malformed \\N character escape";
2758 if (ucnhash_CAPI == NULL) {
2759 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002760 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002761 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002762 if (m == NULL)
2763 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002764 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002765 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002766 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002767 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002768 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002769 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002770 if (ucnhash_CAPI == NULL)
2771 goto ucnhashError;
2772 }
2773 if (*s == '{') {
2774 const char *start = s+1;
2775 /* look for the closing brace */
2776 while (*s != '}' && s < end)
2777 s++;
2778 if (s > start && s < end && *s == '}') {
2779 /* found a name. look it up in the unicode database */
2780 message = "unknown Unicode character name";
2781 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002782 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002783 goto store;
2784 }
2785 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 endinpos = s-starts;
2787 outpos = p-PyUnicode_AS_UNICODE(v);
2788 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002789 errors, &errorHandler,
2790 "unicodeescape", message,
2791 starts, size, &startinpos, &endinpos, &exc, &s,
2792 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002793 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002794 break;
2795
2796 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002797 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002798 message = "\\ at end of string";
2799 s--;
2800 endinpos = s-starts;
2801 outpos = p-PyUnicode_AS_UNICODE(v);
2802 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002803 errors, &errorHandler,
2804 "unicodeescape", message,
2805 starts, size, &startinpos, &endinpos, &exc, &s,
2806 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002807 goto onError;
2808 }
2809 else {
2810 *p++ = '\\';
2811 *p++ = (unsigned char)s[-1];
2812 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002813 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002815 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002818 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002819 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002820 Py_XDECREF(errorHandler);
2821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002823
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002824 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002825 PyErr_SetString(
2826 PyExc_UnicodeError,
2827 "\\N escapes not supported (can't load unicodedata module)"
2828 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002829 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002830 Py_XDECREF(errorHandler);
2831 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002832 return NULL;
2833
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002834 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 Py_XDECREF(errorHandler);
2837 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 return NULL;
2839}
2840
2841/* Return a Unicode-Escape string version of the Unicode object.
2842
2843 If quotes is true, the string is enclosed in u"" or u'' quotes as
2844 appropriate.
2845
2846*/
2847
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002848Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002849 Py_ssize_t size,
2850 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002851{
2852 /* like wcschr, but doesn't stop at NULL characters */
2853
2854 while (size-- > 0) {
2855 if (*s == ch)
2856 return s;
2857 s++;
2858 }
2859
2860 return NULL;
2861}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002862
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863static
2864PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002865 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 int quotes)
2867{
2868 PyObject *repr;
2869 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002871 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002872#ifdef Py_UNICODE_WIDE
2873 const Py_ssize_t expandsize = 10;
2874#else
2875 const Py_ssize_t expandsize = 6;
2876#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877
Neal Norwitz17753ec2006-08-21 22:21:19 +00002878 /* XXX(nnorwitz): rather than over-allocating, it would be
2879 better to choose a different scheme. Perhaps scan the
2880 first N-chars of the string and allocate based on that size.
2881 */
2882 /* Initial allocation is based on the longest-possible unichr
2883 escape.
2884
2885 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2886 unichr, so in this case it's the longest unichr escape. In
2887 narrow (UTF-16) builds this is five chars per source unichr
2888 since there are two unichrs in the surrogate pair, so in narrow
2889 (UTF-16) builds it's not the longest unichr escape.
2890
2891 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2892 so in the narrow (UTF-16) build case it's the longest unichr
2893 escape.
2894 */
2895
Neal Norwitze7d8be82008-07-31 17:17:14 +00002896 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002897 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002898
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002899 repr = PyString_FromStringAndSize(NULL,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002900 2
2901 + expandsize*size
2902 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 if (repr == NULL)
2904 return NULL;
2905
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002906 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907
2908 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002910 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 !findchar(s, size, '"')) ? '"' : '\'';
2912 }
2913 while (size-- > 0) {
2914 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002915
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002916 /* Escape quotes and backslashes */
2917 if ((quotes &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002918 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 *p++ = '\\';
2920 *p++ = (char) ch;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002921 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002922 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002923
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002924#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002925 /* Map 21-bit characters to '\U00xxxxxx' */
2926 else if (ch >= 0x10000) {
2927 *p++ = '\\';
2928 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002929 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2930 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2931 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2932 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2933 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2934 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2935 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002936 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002937 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002938 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002939#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002940 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2941 else if (ch >= 0xD800 && ch < 0xDC00) {
2942 Py_UNICODE ch2;
2943 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002944
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002945 ch2 = *s++;
2946 size--;
2947 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2948 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2949 *p++ = '\\';
2950 *p++ = 'U';
2951 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2952 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2953 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2954 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2955 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2956 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2957 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2958 *p++ = hexdigit[ucs & 0x0000000F];
2959 continue;
2960 }
2961 /* Fall through: isolated surrogates are copied as-is */
2962 s--;
2963 size++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002964 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002965#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002966
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002968 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969 *p++ = '\\';
2970 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002971 *p++ = hexdigit[(ch >> 12) & 0x000F];
2972 *p++ = hexdigit[(ch >> 8) & 0x000F];
2973 *p++ = hexdigit[(ch >> 4) & 0x000F];
2974 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002976
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002977 /* Map special whitespace to '\t', \n', '\r' */
2978 else if (ch == '\t') {
2979 *p++ = '\\';
2980 *p++ = 't';
2981 }
2982 else if (ch == '\n') {
2983 *p++ = '\\';
2984 *p++ = 'n';
2985 }
2986 else if (ch == '\r') {
2987 *p++ = '\\';
2988 *p++ = 'r';
2989 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002990
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002991 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002992 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002994 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002995 *p++ = hexdigit[(ch >> 4) & 0x000F];
2996 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002997 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002998
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 /* Copy everything else as-is */
3000 else
3001 *p++ = (char) ch;
3002 }
3003 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003004 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005
3006 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003007 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 return repr;
3009}
3010
3011PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003012 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013{
3014 return unicodeescape_string(s, size, 0);
3015}
3016
3017PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3018{
3019 if (!PyUnicode_Check(unicode)) {
3020 PyErr_BadArgument();
3021 return NULL;
3022 }
3023 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003024 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025}
3026
3027/* --- Raw Unicode Escape Codec ------------------------------------------- */
3028
3029PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003030 Py_ssize_t size,
3031 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003034 Py_ssize_t startinpos;
3035 Py_ssize_t endinpos;
3036 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 const char *end;
3040 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 PyObject *errorHandler = NULL;
3042 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003043
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 /* Escaped strings will always be longer than the resulting
3045 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 length after conversion to the true value. (But decoding error
3047 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 v = _PyUnicode_New(size);
3049 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003050 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003052 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 end = s + size;
3055 while (s < end) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003056 unsigned char c;
3057 Py_UCS4 x;
3058 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003059 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003061 /* Non-escape characters are interpreted as Unicode ordinals */
3062 if (*s != '\\') {
3063 *p++ = (unsigned char)*s++;
3064 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003065 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003066 startinpos = s-starts;
3067
3068 /* \u-escapes are only interpreted iff the number of leading
3069 backslashes if odd */
3070 bs = s;
3071 for (;s < end;) {
3072 if (*s != '\\')
3073 break;
3074 *p++ = (unsigned char)*s++;
3075 }
3076 if (((s - bs) & 1) == 0 ||
3077 s >= end ||
3078 (*s != 'u' && *s != 'U')) {
3079 continue;
3080 }
3081 p--;
3082 count = *s=='u' ? 4 : 8;
3083 s++;
3084
3085 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3086 outpos = p-PyUnicode_AS_UNICODE(v);
3087 for (x = 0, i = 0; i < count; ++i, ++s) {
3088 c = (unsigned char)*s;
3089 if (!isxdigit(c)) {
3090 endinpos = s-starts;
3091 if (unicode_decode_call_errorhandler(
3092 errors, &errorHandler,
3093 "rawunicodeescape", "truncated \\uXXXX",
3094 starts, size, &startinpos, &endinpos, &exc, &s,
3095 &v, &outpos, &p))
3096 goto onError;
3097 goto nextByte;
3098 }
3099 x = (x<<4) & ~0xF;
3100 if (c >= '0' && c <= '9')
3101 x += c - '0';
3102 else if (c >= 'a' && c <= 'f')
3103 x += 10 + c - 'a';
3104 else
3105 x += 10 + c - 'A';
3106 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003107 if (x <= 0xffff)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003108 /* UCS-2 character */
3109 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003110 else if (x <= 0x10ffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003111 /* UCS-4 character. Either store directly, or as
3112 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003113#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003114 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003115#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003116 x -= 0x10000L;
3117 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3118 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003119#endif
3120 } else {
3121 endinpos = s-starts;
3122 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003123 if (unicode_decode_call_errorhandler(
3124 errors, &errorHandler,
3125 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003126 starts, size, &startinpos, &endinpos, &exc, &s,
3127 &v, &outpos, &p))
3128 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003129 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003130 nextByte:
3131 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003133 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003134 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003135 Py_XDECREF(errorHandler);
3136 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003138
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003139 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003141 Py_XDECREF(errorHandler);
3142 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 return NULL;
3144}
3145
3146PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003147 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148{
3149 PyObject *repr;
3150 char *p;
3151 char *q;
3152
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003153 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003154#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003155 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003156#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003157 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003158#endif
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003159
Neal Norwitze7d8be82008-07-31 17:17:14 +00003160 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003161 return PyErr_NoMemory();
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003162
Neal Norwitze7d8be82008-07-31 17:17:14 +00003163 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 if (repr == NULL)
3165 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003166 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003167 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003169 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 while (size-- > 0) {
3171 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003172#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003173 /* Map 32-bit characters to '\Uxxxxxxxx' */
3174 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003175 *p++ = '\\';
3176 *p++ = 'U';
3177 *p++ = hexdigit[(ch >> 28) & 0xf];
3178 *p++ = hexdigit[(ch >> 24) & 0xf];
3179 *p++ = hexdigit[(ch >> 20) & 0xf];
3180 *p++ = hexdigit[(ch >> 16) & 0xf];
3181 *p++ = hexdigit[(ch >> 12) & 0xf];
3182 *p++ = hexdigit[(ch >> 8) & 0xf];
3183 *p++ = hexdigit[(ch >> 4) & 0xf];
3184 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003185 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003186 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003187#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003188 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3189 if (ch >= 0xD800 && ch < 0xDC00) {
3190 Py_UNICODE ch2;
3191 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003192
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003193 ch2 = *s++;
3194 size--;
3195 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3196 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3197 *p++ = '\\';
3198 *p++ = 'U';
3199 *p++ = hexdigit[(ucs >> 28) & 0xf];
3200 *p++ = hexdigit[(ucs >> 24) & 0xf];
3201 *p++ = hexdigit[(ucs >> 20) & 0xf];
3202 *p++ = hexdigit[(ucs >> 16) & 0xf];
3203 *p++ = hexdigit[(ucs >> 12) & 0xf];
3204 *p++ = hexdigit[(ucs >> 8) & 0xf];
3205 *p++ = hexdigit[(ucs >> 4) & 0xf];
3206 *p++ = hexdigit[ucs & 0xf];
3207 continue;
3208 }
3209 /* Fall through: isolated surrogates are copied as-is */
3210 s--;
3211 size++;
3212 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003213#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003214 /* Map 16-bit characters to '\uxxxx' */
3215 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 *p++ = '\\';
3217 *p++ = 'u';
3218 *p++ = hexdigit[(ch >> 12) & 0xf];
3219 *p++ = hexdigit[(ch >> 8) & 0xf];
3220 *p++ = hexdigit[(ch >> 4) & 0xf];
3221 *p++ = hexdigit[ch & 15];
3222 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003223 /* Copy everything else as-is */
3224 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 *p++ = (char) ch;
3226 }
3227 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003228 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 return repr;
3230}
3231
3232PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3233{
3234 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003235 PyErr_BadArgument();
3236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
3238 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003239 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240}
3241
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003242/* --- Unicode Internal Codec ------------------------------------------- */
3243
3244PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003245 Py_ssize_t size,
3246 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003247{
3248 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003249 Py_ssize_t startinpos;
3250 Py_ssize_t endinpos;
3251 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003252 PyUnicodeObject *v;
3253 Py_UNICODE *p;
3254 const char *end;
3255 const char *reason;
3256 PyObject *errorHandler = NULL;
3257 PyObject *exc = NULL;
3258
Neal Norwitzd43069c2006-01-08 01:12:10 +00003259#ifdef Py_UNICODE_WIDE
3260 Py_UNICODE unimax = PyUnicode_GetMax();
3261#endif
3262
Armin Rigo7ccbca92006-10-04 12:17:45 +00003263 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003264 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3265 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003266 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003267 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003268 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003269 p = PyUnicode_AS_UNICODE(v);
3270 end = s + size;
3271
3272 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003273 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003274 /* We have to sanity check the raw data, otherwise doom looms for
3275 some malformed UCS-4 data. */
3276 if (
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003277#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003278 *p > unimax || *p < 0 ||
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003279#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003280 end-s < Py_UNICODE_SIZE
3281 )
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003282 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003283 startinpos = s - starts;
3284 if (end-s < Py_UNICODE_SIZE) {
3285 endinpos = end-starts;
3286 reason = "truncated input";
3287 }
3288 else {
3289 endinpos = s - starts + Py_UNICODE_SIZE;
3290 reason = "illegal code point (> 0x10FFFF)";
3291 }
3292 outpos = p - PyUnicode_AS_UNICODE(v);
3293 if (unicode_decode_call_errorhandler(
3294 errors, &errorHandler,
3295 "unicode_internal", reason,
3296 starts, size, &startinpos, &endinpos, &exc, &s,
Benjamin Peterson828a7062008-12-27 17:05:29 +00003297 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003298 goto onError;
3299 }
3300 }
3301 else {
3302 p++;
3303 s += Py_UNICODE_SIZE;
3304 }
3305 }
3306
Martin v. Löwis412fb672006-04-13 06:34:32 +00003307 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003308 goto onError;
3309 Py_XDECREF(errorHandler);
3310 Py_XDECREF(exc);
3311 return (PyObject *)v;
3312
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003313 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003314 Py_XDECREF(v);
3315 Py_XDECREF(errorHandler);
3316 Py_XDECREF(exc);
3317 return NULL;
3318}
3319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320/* --- Latin-1 Codec ------------------------------------------------------ */
3321
3322PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003323 Py_ssize_t size,
3324 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325{
3326 PyUnicodeObject *v;
3327 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003328
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003330 if (size == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003331 Py_UNICODE r = *(unsigned char*)s;
3332 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003333 }
3334
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 v = _PyUnicode_New(size);
3336 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003337 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003339 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 p = PyUnicode_AS_UNICODE(v);
3341 while (size-- > 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003342 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003344
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003345 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 Py_XDECREF(v);
3347 return NULL;
3348}
3349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350/* create or adjust a UnicodeEncodeError */
3351static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003352 const char *encoding,
3353 const Py_UNICODE *unicode, Py_ssize_t size,
3354 Py_ssize_t startpos, Py_ssize_t endpos,
3355 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003357 if (*exceptionObject == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003358 *exceptionObject = PyUnicodeEncodeError_Create(
3359 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 }
3361 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003362 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3363 goto onError;
3364 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3365 goto onError;
3366 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3367 goto onError;
3368 return;
3369 onError:
3370 Py_DECREF(*exceptionObject);
3371 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 }
3373}
3374
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003375/* raises a UnicodeEncodeError */
3376static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003377 const char *encoding,
3378 const Py_UNICODE *unicode, Py_ssize_t size,
3379 Py_ssize_t startpos, Py_ssize_t endpos,
3380 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381{
3382 make_encode_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003383 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 if (*exceptionObject != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003385 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386}
3387
3388/* error handling callback helper:
3389 build arguments, call the callback and check the arguments,
3390 put the result into newpos and return the replacement string, which
3391 has to be freed by the caller */
3392static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003393 PyObject **errorHandler,
3394 const char *encoding, const char *reason,
3395 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3396 Py_ssize_t startpos, Py_ssize_t endpos,
3397 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003399 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400
3401 PyObject *restuple;
3402 PyObject *resunicode;
3403
3404 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003405 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003406 if (*errorHandler == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003407 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 }
3409
3410 make_encode_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003411 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 if (*exceptionObject == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003413 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003414
3415 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003416 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00003420 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003421 Py_DECREF(restuple);
3422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 }
3424 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003425 &resunicode, newpos)) {
3426 Py_DECREF(restuple);
3427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 }
3429 if (*newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003430 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003431 if (*newpos<0 || *newpos>size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003432 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3433 Py_DECREF(restuple);
3434 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 Py_INCREF(resunicode);
3437 Py_DECREF(restuple);
3438 return resunicode;
3439}
3440
3441static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003442 Py_ssize_t size,
3443 const char *errors,
3444 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445{
3446 /* output object */
3447 PyObject *res;
3448 /* pointers to the beginning and end+1 of input */
3449 const Py_UNICODE *startp = p;
3450 const Py_UNICODE *endp = p + size;
3451 /* pointer to the beginning of the unencodable characters */
3452 /* const Py_UNICODE *badp = NULL; */
3453 /* pointer into the output */
3454 char *str;
3455 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003456 Py_ssize_t respos = 0;
3457 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003458 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3459 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 PyObject *errorHandler = NULL;
3461 PyObject *exc = NULL;
3462 /* the following variable is used for caching string comparisons
3463 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3464 int known_errorHandler = -1;
3465
3466 /* allocate enough for a simple encoding without
3467 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003468 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 if (res == NULL)
3470 goto onError;
3471 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003472 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003473 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474 ressize = size;
3475
3476 while (p<endp) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003477 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003479 /* can we encode this? */
3480 if (c<limit) {
3481 /* no overflow check, because we know that the space is enough */
3482 *str++ = (char)c;
3483 ++p;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003484 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003485 else {
3486 Py_ssize_t unicodepos = p-startp;
3487 Py_ssize_t requiredsize;
3488 PyObject *repunicode;
3489 Py_ssize_t repsize;
3490 Py_ssize_t newpos;
3491 Py_ssize_t respos;
3492 Py_UNICODE *uni2;
3493 /* startpos for collecting unencodable chars */
3494 const Py_UNICODE *collstart = p;
3495 const Py_UNICODE *collend = p;
3496 /* find all unecodable characters */
3497 while ((collend < endp) && ((*collend)>=limit))
3498 ++collend;
3499 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3500 if (known_errorHandler==-1) {
3501 if ((errors==NULL) || (!strcmp(errors, "strict")))
3502 known_errorHandler = 1;
3503 else if (!strcmp(errors, "replace"))
3504 known_errorHandler = 2;
3505 else if (!strcmp(errors, "ignore"))
3506 known_errorHandler = 3;
3507 else if (!strcmp(errors, "xmlcharrefreplace"))
3508 known_errorHandler = 4;
3509 else
3510 known_errorHandler = 0;
3511 }
3512 switch (known_errorHandler) {
3513 case 1: /* strict */
3514 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3515 goto onError;
3516 case 2: /* replace */
3517 while (collstart++<collend)
3518 *str++ = '?'; /* fall through */
3519 case 3: /* ignore */
3520 p = collend;
3521 break;
3522 case 4: /* xmlcharrefreplace */
3523 respos = str-PyString_AS_STRING(res);
3524 /* determine replacement size (temporarily (mis)uses p) */
3525 for (p = collstart, repsize = 0; p < collend; ++p) {
3526 if (*p<10)
3527 repsize += 2+1+1;
3528 else if (*p<100)
3529 repsize += 2+2+1;
3530 else if (*p<1000)
3531 repsize += 2+3+1;
3532 else if (*p<10000)
3533 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003534#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003535 else
3536 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003537#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003538 else if (*p<100000)
3539 repsize += 2+5+1;
3540 else if (*p<1000000)
3541 repsize += 2+6+1;
3542 else
3543 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003544#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003545 }
3546 requiredsize = respos+repsize+(endp-collend);
3547 if (requiredsize > ressize) {
3548 if (requiredsize<2*ressize)
3549 requiredsize = 2*ressize;
3550 if (_PyString_Resize(&res, requiredsize))
3551 goto onError;
3552 str = PyString_AS_STRING(res) + respos;
3553 ressize = requiredsize;
3554 }
3555 /* generate replacement (temporarily (mis)uses p) */
3556 for (p = collstart; p < collend; ++p) {
3557 str += sprintf(str, "&#%d;", (int)*p);
3558 }
3559 p = collend;
3560 break;
3561 default:
3562 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3563 encoding, reason, startp, size, &exc,
3564 collstart-startp, collend-startp, &newpos);
3565 if (repunicode == NULL)
3566 goto onError;
3567 /* need more space? (at least enough for what we
3568 have+the replacement+the rest of the string, so
3569 we won't have to check space for encodable characters) */
3570 respos = str-PyString_AS_STRING(res);
3571 repsize = PyUnicode_GET_SIZE(repunicode);
3572 requiredsize = respos+repsize+(endp-collend);
3573 if (requiredsize > ressize) {
3574 if (requiredsize<2*ressize)
3575 requiredsize = 2*ressize;
3576 if (_PyString_Resize(&res, requiredsize)) {
3577 Py_DECREF(repunicode);
3578 goto onError;
3579 }
3580 str = PyString_AS_STRING(res) + respos;
3581 ressize = requiredsize;
3582 }
3583 /* check if there is anything unencodable in the replacement
3584 and copy it to the output */
3585 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3586 c = *uni2;
3587 if (c >= limit) {
3588 raise_encode_exception(&exc, encoding, startp, size,
3589 unicodepos, unicodepos+1, reason);
3590 Py_DECREF(repunicode);
3591 goto onError;
3592 }
3593 *str = (char)c;
3594 }
3595 p = startp + newpos;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003596 Py_DECREF(repunicode);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003597 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003598 }
3599 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003601 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 if (respos<ressize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003603 /* If this falls res will be NULL */
3604 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 Py_XDECREF(errorHandler);
3606 Py_XDECREF(exc);
3607 return res;
3608
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003609 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 Py_XDECREF(res);
3611 Py_XDECREF(errorHandler);
3612 Py_XDECREF(exc);
3613 return NULL;
3614}
3615
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003617 Py_ssize_t size,
3618 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621}
3622
3623PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3624{
3625 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003626 PyErr_BadArgument();
3627 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 }
3629 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003630 PyUnicode_GET_SIZE(unicode),
3631 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632}
3633
3634/* --- 7-bit ASCII Codec -------------------------------------------------- */
3635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003637 Py_ssize_t size,
3638 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 PyUnicodeObject *v;
3642 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003643 Py_ssize_t startinpos;
3644 Py_ssize_t endinpos;
3645 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646 const char *e;
3647 PyObject *errorHandler = NULL;
3648 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003649
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003651 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003652 Py_UNICODE r = *(unsigned char*)s;
3653 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003654 }
Tim Petersced69f82003-09-16 20:30:58 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 v = _PyUnicode_New(size);
3657 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003658 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003660 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 e = s + size;
3663 while (s < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003664 register unsigned char c = (unsigned char)*s;
3665 if (c < 128) {
3666 *p++ = c;
3667 ++s;
3668 }
3669 else {
3670 startinpos = s-starts;
3671 endinpos = startinpos + 1;
3672 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3673 if (unicode_decode_call_errorhandler(
3674 errors, &errorHandler,
3675 "ascii", "ordinal not in range(128)",
3676 starts, size, &startinpos, &endinpos, &exc, &s,
3677 &v, &outpos, &p))
3678 goto onError;
3679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003681 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003682 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 Py_XDECREF(errorHandler);
3685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003687
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003688 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 Py_XDECREF(errorHandler);
3691 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 return NULL;
3693}
3694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003696 Py_ssize_t size,
3697 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700}
3701
3702PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3703{
3704 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003705 PyErr_BadArgument();
3706 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 }
3708 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003709 PyUnicode_GET_SIZE(unicode),
3710 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711}
3712
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003713#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003714
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003715/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003716
Hirokazu Yamamoto68e075e2009-03-21 13:04:41 +00003717#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003718#define NEED_RETRY
3719#endif
3720
3721/* XXX This code is limited to "true" double-byte encodings, as
3722 a) it assumes an incomplete character consists of a single byte, and
3723 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003724 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003725
/* Report whether the byte at s[offset] is a DBCS lead byte.

   A byte that IsDBCSLeadByte() rejects is never a lead byte.  A byte
   it accepts counts as a lead byte when any of these hold for the
   position CharPrev() reports as the previous character:
     - there is no previous character (CharPrev() returned `curr`),
     - the previous character does not start with a lead byte,
     - the previous character is exactly two bytes long.
   Returns nonzero for a lead byte, 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
3736
3737/*
3738 * Decode MBCS string into unicode object. If 'final' is set, converts
3739 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3740 */
3741static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003742 const char *s, /* MBCS string */
3743 int size, /* sizeof MBCS string */
3744 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003745{
3746 Py_UNICODE *p;
3747 Py_ssize_t n = 0;
3748 int usize = 0;
3749
3750 assert(size >= 0);
3751
3752 /* Skip trailing lead-byte unless 'final' is set */
3753 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003754 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003755
3756 /* First get the size of the result */
3757 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003758 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3759 if (usize == 0) {
3760 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3761 return -1;
3762 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003763 }
3764
3765 if (*v == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003766 /* Create unicode object */
3767 *v = _PyUnicode_New(usize);
3768 if (*v == NULL)
3769 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003770 }
3771 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003772 /* Extend unicode object */
3773 n = PyUnicode_GET_SIZE(*v);
3774 if (_PyUnicode_Resize(v, n + usize) < 0)
3775 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003776 }
3777
3778 /* Do the conversion */
3779 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003780 p = PyUnicode_AS_UNICODE(*v) + n;
3781 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3782 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3783 return -1;
3784 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003785 }
3786
3787 return size;
3788}
3789
3790PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003791 Py_ssize_t size,
3792 const char *errors,
3793 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003794{
3795 PyUnicodeObject *v = NULL;
3796 int done;
3797
3798 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003799 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003800
3801#ifdef NEED_RETRY
3802 retry:
3803 if (size > INT_MAX)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003804 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003805 else
3806#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003807 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003808
3809 if (done < 0) {
3810 Py_XDECREF(v);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003811 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003812 }
3813
3814 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003815 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003816
3817#ifdef NEED_RETRY
3818 if (size > INT_MAX) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003819 s += done;
3820 size -= done;
3821 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003822 }
3823#endif
3824
3825 return (PyObject *)v;
3826}
3827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003828PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003829 Py_ssize_t size,
3830 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003831{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003832 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3833}
3834
3835/*
3836 * Convert unicode into string object (MBCS).
3837 * Returns 0 if succeed, -1 otherwise.
3838 */
3839static int encode_mbcs(PyObject **repr,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003840 const Py_UNICODE *p, /* unicode */
3841 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003842{
3843 int mbcssize = 0;
3844 Py_ssize_t n = 0;
3845
3846 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003847
3848 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003849 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003850 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3851 if (mbcssize == 0) {
3852 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3853 return -1;
3854 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003855 }
3856
Martin v. Löwisd8251432006-06-14 05:21:04 +00003857 if (*repr == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003858 /* Create string object */
3859 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3860 if (*repr == NULL)
3861 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003862 }
3863 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003864 /* Extend string object */
3865 n = PyString_Size(*repr);
3866 if (_PyString_Resize(repr, n + mbcssize) < 0)
3867 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003868 }
3869
3870 /* Do the conversion */
3871 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003872 char *s = PyString_AS_STRING(*repr) + n;
3873 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3874 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3875 return -1;
3876 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003877 }
3878
3879 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003880}
3881
3882PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003883 Py_ssize_t size,
3884 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003885{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003886 PyObject *repr = NULL;
3887 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003888
Martin v. Löwisd8251432006-06-14 05:21:04 +00003889#ifdef NEED_RETRY
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003890 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00003891 if (size > INT_MAX)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003892 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003893 else
3894#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003895 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003896
Martin v. Löwisd8251432006-06-14 05:21:04 +00003897 if (ret < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003898 Py_XDECREF(repr);
3899 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003900 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003901
3902#ifdef NEED_RETRY
3903 if (size > INT_MAX) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003904 p += INT_MAX;
3905 size -= INT_MAX;
3906 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003907 }
3908#endif
3909
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003910 return repr;
3911}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003912
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003913PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3914{
3915 if (!PyUnicode_Check(unicode)) {
3916 PyErr_BadArgument();
3917 return NULL;
3918 }
3919 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003920 PyUnicode_GET_SIZE(unicode),
3921 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003922}
3923
Martin v. Löwisd8251432006-06-14 05:21:04 +00003924#undef NEED_RETRY
3925
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003926#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003927
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928/* --- Character Mapping Codec -------------------------------------------- */
3929
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003931 Py_ssize_t size,
3932 PyObject *mapping,
3933 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003936 Py_ssize_t startinpos;
3937 Py_ssize_t endinpos;
3938 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003939 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 PyUnicodeObject *v;
3941 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003942 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943 PyObject *errorHandler = NULL;
3944 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003945 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003946 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003947
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 /* Default to Latin-1 */
3949 if (mapping == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003950 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951
3952 v = _PyUnicode_New(size);
3953 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003954 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003956 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003959 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003960 mapstring = PyUnicode_AS_UNICODE(mapping);
3961 maplen = PyUnicode_GET_SIZE(mapping);
3962 while (s < e) {
3963 unsigned char ch = *s;
3964 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003966 if (ch < maplen)
3967 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003969 if (x == 0xfffe) {
3970 /* undefined mapping */
3971 outpos = p-PyUnicode_AS_UNICODE(v);
3972 startinpos = s-starts;
3973 endinpos = startinpos+1;
3974 if (unicode_decode_call_errorhandler(
3975 errors, &errorHandler,
3976 "charmap", "character maps to <undefined>",
3977 starts, size, &startinpos, &endinpos, &exc, &s,
3978 &v, &outpos, &p)) {
3979 goto onError;
3980 }
3981 continue;
3982 }
3983 *p++ = x;
3984 ++s;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003985 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003986 }
3987 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003988 while (s < e) {
3989 unsigned char ch = *s;
3990 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003991
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003992 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3993 w = PyInt_FromLong((long)ch);
3994 if (w == NULL)
3995 goto onError;
3996 x = PyObject_GetItem(mapping, w);
3997 Py_DECREF(w);
3998 if (x == NULL) {
3999 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4000 /* No mapping found means: mapping is undefined. */
4001 PyErr_Clear();
4002 x = Py_None;
4003 Py_INCREF(x);
4004 } else
4005 goto onError;
4006 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004007
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004008 /* Apply mapping */
4009 if (PyInt_Check(x)) {
4010 long value = PyInt_AS_LONG(x);
4011 if (value < 0 || value > 65535) {
4012 PyErr_SetString(PyExc_TypeError,
4013 "character mapping must be in range(65536)");
4014 Py_DECREF(x);
4015 goto onError;
4016 }
4017 *p++ = (Py_UNICODE)value;
4018 }
4019 else if (x == Py_None) {
4020 /* undefined mapping */
4021 outpos = p-PyUnicode_AS_UNICODE(v);
4022 startinpos = s-starts;
4023 endinpos = startinpos+1;
4024 if (unicode_decode_call_errorhandler(
4025 errors, &errorHandler,
4026 "charmap", "character maps to <undefined>",
4027 starts, size, &startinpos, &endinpos, &exc, &s,
4028 &v, &outpos, &p)) {
4029 Py_DECREF(x);
4030 goto onError;
4031 }
4032 Py_DECREF(x);
4033 continue;
4034 }
4035 else if (PyUnicode_Check(x)) {
4036 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004037
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004038 if (targetsize == 1)
4039 /* 1-1 mapping */
4040 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004041
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004042 else if (targetsize > 1) {
4043 /* 1-n mapping */
4044 if (targetsize > extrachars) {
4045 /* resize first */
4046 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4047 Py_ssize_t needed = (targetsize - extrachars) + \
4048 (targetsize << 2);
4049 extrachars += needed;
4050 /* XXX overflow detection missing */
4051 if (_PyUnicode_Resize(&v,
4052 PyUnicode_GET_SIZE(v) + needed) < 0) {
4053 Py_DECREF(x);
4054 goto onError;
4055 }
4056 p = PyUnicode_AS_UNICODE(v) + oldpos;
4057 }
4058 Py_UNICODE_COPY(p,
4059 PyUnicode_AS_UNICODE(x),
4060 targetsize);
4061 p += targetsize;
4062 extrachars -= targetsize;
4063 }
4064 /* 1-0 mapping: skip the character */
4065 }
4066 else {
4067 /* wrong return value */
4068 PyErr_SetString(PyExc_TypeError,
4069 "character mapping must return integer, None or unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004070 Py_DECREF(x);
4071 goto onError;
4072 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004073 Py_DECREF(x);
4074 ++s;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 }
4077 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004078 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4079 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 Py_XDECREF(errorHandler);
4081 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004083
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004084 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085 Py_XDECREF(errorHandler);
4086 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 Py_XDECREF(v);
4088 return NULL;
4089}
4090
Martin v. Löwis3f767792006-06-04 19:36:28 +00004091/* Charmap encoding: the lookup table */
4092
4093struct encoding_map{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004094 PyObject_HEAD
4095 unsigned char level1[32];
4096 int count2, count3;
4097 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004098};
4099
4100static PyObject*
4101encoding_map_size(PyObject *obj, PyObject* args)
4102{
4103 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004104 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004105 128*map->count3);
4106}
4107
4108static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004109 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004110 PyDoc_STR("Return the size (in bytes) of this object") },
4111 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004112};
4113
4114static void
4115encoding_map_dealloc(PyObject* o)
4116{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004117 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004118}
4119
4120static PyTypeObject EncodingMapType = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004121 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004122 "EncodingMap", /*tp_name*/
4123 sizeof(struct encoding_map), /*tp_basicsize*/
4124 0, /*tp_itemsize*/
4125 /* methods */
4126 encoding_map_dealloc, /*tp_dealloc*/
4127 0, /*tp_print*/
4128 0, /*tp_getattr*/
4129 0, /*tp_setattr*/
4130 0, /*tp_compare*/
4131 0, /*tp_repr*/
4132 0, /*tp_as_number*/
4133 0, /*tp_as_sequence*/
4134 0, /*tp_as_mapping*/
4135 0, /*tp_hash*/
4136 0, /*tp_call*/
4137 0, /*tp_str*/
4138 0, /*tp_getattro*/
4139 0, /*tp_setattro*/
4140 0, /*tp_as_buffer*/
4141 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4142 0, /*tp_doc*/
4143 0, /*tp_traverse*/
4144 0, /*tp_clear*/
4145 0, /*tp_richcompare*/
4146 0, /*tp_weaklistoffset*/
4147 0, /*tp_iter*/
4148 0, /*tp_iternext*/
4149 encoding_map_methods, /*tp_methods*/
4150 0, /*tp_members*/
4151 0, /*tp_getset*/
4152 0, /*tp_base*/
4153 0, /*tp_dict*/
4154 0, /*tp_descr_get*/
4155 0, /*tp_descr_set*/
4156 0, /*tp_dictoffset*/
4157 0, /*tp_init*/
4158 0, /*tp_alloc*/
4159 0, /*tp_new*/
4160 0, /*tp_free*/
4161 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004162};
4163
4164PyObject*
4165PyUnicode_BuildEncodingMap(PyObject* string)
4166{
4167 Py_UNICODE *decode;
4168 PyObject *result;
4169 struct encoding_map *mresult;
4170 int i;
4171 int need_dict = 0;
4172 unsigned char level1[32];
4173 unsigned char level2[512];
4174 unsigned char *mlevel1, *mlevel2, *mlevel3;
4175 int count2 = 0, count3 = 0;
4176
4177 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4178 PyErr_BadArgument();
4179 return NULL;
4180 }
4181 decode = PyUnicode_AS_UNICODE(string);
4182 memset(level1, 0xFF, sizeof level1);
4183 memset(level2, 0xFF, sizeof level2);
4184
4185 /* If there isn't a one-to-one mapping of NULL to \0,
4186 or if there are non-BMP characters, we need to use
4187 a mapping dictionary. */
4188 if (decode[0] != 0)
4189 need_dict = 1;
4190 for (i = 1; i < 256; i++) {
4191 int l1, l2;
4192 if (decode[i] == 0
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004193#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004194 || decode[i] > 0xFFFF
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004195#endif
4196 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004197 need_dict = 1;
4198 break;
4199 }
4200 if (decode[i] == 0xFFFE)
4201 /* unmapped character */
4202 continue;
4203 l1 = decode[i] >> 11;
4204 l2 = decode[i] >> 7;
4205 if (level1[l1] == 0xFF)
4206 level1[l1] = count2++;
4207 if (level2[l2] == 0xFF)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004208 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004209 }
4210
4211 if (count2 >= 0xFF || count3 >= 0xFF)
4212 need_dict = 1;
4213
4214 if (need_dict) {
4215 PyObject *result = PyDict_New();
4216 PyObject *key, *value;
4217 if (!result)
4218 return NULL;
4219 for (i = 0; i < 256; i++) {
4220 key = value = NULL;
4221 key = PyInt_FromLong(decode[i]);
4222 value = PyInt_FromLong(i);
4223 if (!key || !value)
4224 goto failed1;
4225 if (PyDict_SetItem(result, key, value) == -1)
4226 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004227 Py_DECREF(key);
4228 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004229 }
4230 return result;
4231 failed1:
4232 Py_XDECREF(key);
4233 Py_XDECREF(value);
4234 Py_DECREF(result);
4235 return NULL;
4236 }
4237
4238 /* Create a three-level trie */
4239 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4240 16*count2 + 128*count3 - 1);
4241 if (!result)
4242 return PyErr_NoMemory();
4243 PyObject_Init(result, &EncodingMapType);
4244 mresult = (struct encoding_map*)result;
4245 mresult->count2 = count2;
4246 mresult->count3 = count3;
4247 mlevel1 = mresult->level1;
4248 mlevel2 = mresult->level23;
4249 mlevel3 = mresult->level23 + 16*count2;
4250 memcpy(mlevel1, level1, 32);
4251 memset(mlevel2, 0xFF, 16*count2);
4252 memset(mlevel3, 0, 128*count3);
4253 count3 = 0;
4254 for (i = 1; i < 256; i++) {
4255 int o1, o2, o3, i2, i3;
4256 if (decode[i] == 0xFFFE)
4257 /* unmapped character */
4258 continue;
4259 o1 = decode[i]>>11;
4260 o2 = (decode[i]>>7) & 0xF;
4261 i2 = 16*mlevel1[o1] + o2;
4262 if (mlevel2[i2] == 0xFF)
4263 mlevel2[i2] = count3++;
4264 o3 = decode[i] & 0x7F;
4265 i3 = 128*mlevel2[i2] + o3;
4266 mlevel3[i3] = i;
4267 }
4268 return result;
4269}
4270
4271static int
4272encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4273{
4274 struct encoding_map *map = (struct encoding_map*)mapping;
4275 int l1 = c>>11;
4276 int l2 = (c>>7) & 0xF;
4277 int l3 = c & 0x7F;
4278 int i;
4279
4280#ifdef Py_UNICODE_WIDE
4281 if (c > 0xFFFF) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004282 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004283 }
4284#endif
4285 if (c == 0)
4286 return 0;
4287 /* level 1*/
4288 i = map->level1[l1];
4289 if (i == 0xFF) {
4290 return -1;
4291 }
4292 /* level 2*/
4293 i = map->level23[16*i+l2];
4294 if (i == 0xFF) {
4295 return -1;
4296 }
4297 /* level 3 */
4298 i = map->level23[16*map->count2 + 128*i + l3];
4299 if (i == 0) {
4300 return -1;
4301 }
4302 return i;
4303}
4304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305/* Lookup the character ch in the mapping. If the character
4306 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004307 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 PyObject *w = PyInt_FromLong((long)c);
4311 PyObject *x;
4312
4313 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004314 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315 x = PyObject_GetItem(mapping, w);
4316 Py_DECREF(w);
4317 if (x == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004318 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4319 /* No mapping found means: mapping is undefined. */
4320 PyErr_Clear();
4321 x = Py_None;
4322 Py_INCREF(x);
4323 return x;
4324 } else
4325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004327 else if (x == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004328 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329 else if (PyInt_Check(x)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004330 long value = PyInt_AS_LONG(x);
4331 if (value < 0 || value > 255) {
4332 PyErr_SetString(PyExc_TypeError,
4333 "character mapping must be in range(256)");
4334 Py_DECREF(x);
4335 return NULL;
4336 }
4337 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004339 else if (PyString_Check(x))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004340 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004342 /* wrong return value */
4343 PyErr_SetString(PyExc_TypeError,
4344 "character mapping must return integer, None or str");
4345 Py_DECREF(x);
4346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347 }
4348}
4349
Martin v. Löwis3f767792006-06-04 19:36:28 +00004350static int
4351charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4352{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004353 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4354 /* exponentially overallocate to minimize reallocations */
4355 if (requiredsize < 2*outsize)
4356 requiredsize = 2*outsize;
4357 if (_PyString_Resize(outobj, requiredsize)) {
4358 return 0;
4359 }
4360 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004361}
4362
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366/* lookup the character, put the result in the output string and adjust
4367 various state variables. Reallocate the output string if not enough
4368 space is available. Return a new reference to the object that
4369 was put in the output buffer, or Py_None, if the mapping was undefined
4370 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004371 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004373charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004374 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004376 PyObject *rep;
4377 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004378 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379
Christian Heimese93237d2007-12-19 02:37:44 +00004380 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004381 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004382 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004383 if (res == -1)
4384 return enc_FAILED;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004385 if (outsize<requiredsize)
4386 if (!charmapencode_resize(outobj, outpos, requiredsize))
4387 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004388 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004389 outstart[(*outpos)++] = (char)res;
4390 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004391 }
4392
4393 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 if (rep==NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004395 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004396 else if (rep==Py_None) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004397 Py_DECREF(rep);
4398 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004399 } else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004400 if (PyInt_Check(rep)) {
4401 Py_ssize_t requiredsize = *outpos+1;
4402 if (outsize<requiredsize)
4403 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4404 Py_DECREF(rep);
4405 return enc_EXCEPTION;
4406 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004407 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004408 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004409 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004410 else {
4411 const char *repchars = PyString_AS_STRING(rep);
4412 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4413 Py_ssize_t requiredsize = *outpos+repsize;
4414 if (outsize<requiredsize)
4415 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4416 Py_DECREF(rep);
4417 return enc_EXCEPTION;
4418 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004419 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004420 memcpy(outstart + *outpos, repchars, repsize);
4421 *outpos += repsize;
4422 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 }
Georg Brandl9f167602006-06-04 21:46:16 +00004424 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004425 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426}
4427
4428/* handle an error in PyUnicode_EncodeCharmap
4429 Return 0 on success, -1 on error */
4430static
4431int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004434 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004435 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436{
4437 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004438 Py_ssize_t repsize;
4439 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 Py_UNICODE *uni2;
4441 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 Py_ssize_t collstartpos = *inpos;
4443 Py_ssize_t collendpos = *inpos+1;
4444 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 char *encoding = "charmap";
4446 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004447 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449 /* find all unencodable characters */
4450 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004451 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004452 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004453 int res = encoding_map_lookup(p[collendpos], mapping);
4454 if (res != -1)
4455 break;
4456 ++collendpos;
4457 continue;
4458 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004459
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004460 rep = charmapencode_lookup(p[collendpos], mapping);
4461 if (rep==NULL)
4462 return -1;
4463 else if (rep!=Py_None) {
4464 Py_DECREF(rep);
4465 break;
4466 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004467 Py_DECREF(rep);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004468 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469 }
4470 /* cache callback name lookup
4471 * (if not done yet, i.e. it's the first error) */
4472 if (*known_errorHandler==-1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004473 if ((errors==NULL) || (!strcmp(errors, "strict")))
4474 *known_errorHandler = 1;
4475 else if (!strcmp(errors, "replace"))
4476 *known_errorHandler = 2;
4477 else if (!strcmp(errors, "ignore"))
4478 *known_errorHandler = 3;
4479 else if (!strcmp(errors, "xmlcharrefreplace"))
4480 *known_errorHandler = 4;
4481 else
4482 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 }
4484 switch (*known_errorHandler) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004485 case 1: /* strict */
4486 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4487 return -1;
4488 case 2: /* replace */
4489 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004490 x = charmapencode_output('?', mapping, res, respos);
4491 if (x==enc_EXCEPTION) {
4492 return -1;
4493 }
4494 else if (x==enc_FAILED) {
4495 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4496 return -1;
4497 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004498 }
4499 /* fall through */
4500 case 3: /* ignore */
4501 *inpos = collendpos;
4502 break;
4503 case 4: /* xmlcharrefreplace */
4504 /* generate replacement (temporarily (mis)uses p) */
4505 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004506 char buffer[2+29+1+1];
4507 char *cp;
4508 sprintf(buffer, "&#%d;", (int)p[collpos]);
4509 for (cp = buffer; *cp; ++cp) {
4510 x = charmapencode_output(*cp, mapping, res, respos);
4511 if (x==enc_EXCEPTION)
4512 return -1;
4513 else if (x==enc_FAILED) {
4514 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4515 return -1;
4516 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004517 }
4518 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004519 *inpos = collendpos;
4520 break;
4521 default:
4522 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004523 encoding, reason, p, size, exceptionObject,
4524 collstartpos, collendpos, &newpos);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004525 if (repunicode == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004526 return -1;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004527 /* generate replacement */
4528 repsize = PyUnicode_GET_SIZE(repunicode);
4529 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004530 x = charmapencode_output(*uni2, mapping, res, respos);
4531 if (x==enc_EXCEPTION) {
4532 return -1;
4533 }
4534 else if (x==enc_FAILED) {
4535 Py_DECREF(repunicode);
4536 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4537 return -1;
4538 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004539 }
4540 *inpos = newpos;
4541 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 }
4543 return 0;
4544}
4545
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004547 Py_ssize_t size,
4548 PyObject *mapping,
4549 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 /* output object */
4552 PyObject *res = NULL;
4553 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004556 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 PyObject *errorHandler = NULL;
4558 PyObject *exc = NULL;
4559 /* the following variable is used for caching string comparisons
4560 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4561 * 3=ignore, 4=xmlcharrefreplace */
4562 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563
4564 /* Default to Latin-1 */
4565 if (mapping == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004566 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 /* allocate enough for a simple encoding without
4569 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004570 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 if (res == NULL)
4572 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004573 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004574 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 while (inpos<size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004577 /* try to encode it */
4578 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4579 if (x==enc_EXCEPTION) /* error */
4580 goto onError;
4581 if (x==enc_FAILED) { /* unencodable character */
4582 if (charmap_encoding_error(p, size, &inpos, mapping,
4583 &exc,
4584 &known_errorHandler, &errorHandler, errors,
4585 &res, &respos)) {
4586 goto onError;
4587 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004588 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004589 else
4590 /* done with this character => adjust input position */
4591 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004595 if (respos<PyString_GET_SIZE(res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004596 if (_PyString_Resize(&res, respos))
4597 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 }
4599 Py_XDECREF(exc);
4600 Py_XDECREF(errorHandler);
4601 return res;
4602
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004603 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 Py_XDECREF(res);
4605 Py_XDECREF(exc);
4606 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607 return NULL;
4608}
4609
4610PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004611 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612{
4613 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004614 PyErr_BadArgument();
4615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 }
4617 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004618 PyUnicode_GET_SIZE(unicode),
4619 mapping,
4620 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621}
4622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623/* create or adjust a UnicodeTranslateError */
4624static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004625 const Py_UNICODE *unicode, Py_ssize_t size,
4626 Py_ssize_t startpos, Py_ssize_t endpos,
4627 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 if (*exceptionObject == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004630 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004631 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632 }
4633 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004634 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4635 goto onError;
4636 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4637 goto onError;
4638 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4639 goto onError;
4640 return;
4641 onError:
4642 Py_DECREF(*exceptionObject);
4643 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644 }
4645}
4646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647/* raises a UnicodeTranslateError */
4648static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004649 const Py_UNICODE *unicode, Py_ssize_t size,
4650 Py_ssize_t startpos, Py_ssize_t endpos,
4651 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652{
4653 make_translate_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004654 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 if (*exceptionObject != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004656 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657}
4658
4659/* error handling callback helper:
4660 build arguments, call the callback and check the arguments,
4661 put the result into newpos and return the replacement string, which
4662 has to be freed by the caller */
4663static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004664 PyObject **errorHandler,
4665 const char *reason,
4666 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4667 Py_ssize_t startpos, Py_ssize_t endpos,
4668 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004670 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004671
Martin v. Löwis412fb672006-04-13 06:34:32 +00004672 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 PyObject *restuple;
4674 PyObject *resunicode;
4675
4676 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004677 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 if (*errorHandler == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 }
4681
4682 make_translate_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004683 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 if (*exceptionObject == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686
4687 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004688 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00004692 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004693 Py_DECREF(restuple);
4694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 }
4696 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004697 &resunicode, &i_newpos)) {
4698 Py_DECREF(restuple);
4699 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004701 if (i_newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004702 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004703 else
4704 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004705 if (*newpos<0 || *newpos>size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004706 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4707 Py_DECREF(restuple);
4708 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 Py_INCREF(resunicode);
4711 Py_DECREF(restuple);
4712 return resunicode;
4713}
4714
4715/* Lookup the character ch in the mapping and put the result in result,
4716 which must be decrefed by the caller.
4717 Return 0 on success, -1 on error */
4718static
4719int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4720{
4721 PyObject *w = PyInt_FromLong((long)c);
4722 PyObject *x;
4723
4724 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004725 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726 x = PyObject_GetItem(mapping, w);
4727 Py_DECREF(w);
4728 if (x == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004729 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4730 /* No mapping found means: use 1:1 mapping. */
4731 PyErr_Clear();
4732 *result = NULL;
4733 return 0;
4734 } else
4735 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 }
4737 else if (x == Py_None) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004738 *result = x;
4739 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004740 }
4741 else if (PyInt_Check(x)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004742 long value = PyInt_AS_LONG(x);
4743 long max = PyUnicode_GetMax();
4744 if (value < 0 || value > max) {
4745 PyErr_Format(PyExc_TypeError,
4746 "character mapping must be in range(0x%lx)", max+1);
4747 Py_DECREF(x);
4748 return -1;
4749 }
4750 *result = x;
4751 return 0;
4752 }
4753 else if (PyUnicode_Check(x)) {
4754 *result = x;
4755 return 0;
4756 }
4757 else {
4758 /* wrong return value */
4759 PyErr_SetString(PyExc_TypeError,
4760 "character mapping must return integer, None or unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004761 Py_DECREF(x);
4762 return -1;
4763 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764}
4765/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004766 if not reallocate and adjust various state variables.
4767 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768static
Walter Dörwald4894c302003-10-24 14:25:28 +00004769int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004770 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004772 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004773 if (requiredsize > oldsize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004774 /* remember old output position */
4775 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4776 /* exponentially overallocate to minimize reallocations */
4777 if (requiredsize < 2 * oldsize)
4778 requiredsize = 2 * oldsize;
4779 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4780 return -1;
4781 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 }
4783 return 0;
4784}
4785/* lookup the character, put the result in the output string and adjust
4786 various state variables. Return a new reference to the object that
4787 was put in the output buffer in *result, or Py_None, if the mapping was
4788 undefined (in which case no character was written).
4789 The called must decref result.
4790 Return 0 on success, -1 on error. */
4791static
Walter Dörwald4894c302003-10-24 14:25:28 +00004792int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004793 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4794 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795{
Walter Dörwald4894c302003-10-24 14:25:28 +00004796 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004797 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 if (*res==NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004799 /* not found => default to 1:1 mapping */
4800 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 }
4802 else if (*res==Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004803 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 else if (PyInt_Check(*res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004805 /* no overflow check, because we know that the space is enough */
4806 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 }
4808 else if (PyUnicode_Check(*res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004809 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4810 if (repsize==1) {
4811 /* no overflow check, because we know that the space is enough */
4812 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4813 }
4814 else if (repsize!=0) {
4815 /* more than one character */
4816 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4817 (insize - (curinp-startinp)) +
4818 repsize - 1;
4819 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4820 return -1;
4821 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4822 *outp += repsize;
4823 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 }
4825 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004826 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004827 return 0;
4828}
4829
4830PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004831 Py_ssize_t size,
4832 PyObject *mapping,
4833 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 /* output object */
4836 PyObject *res = NULL;
4837 /* pointers to the beginning and end+1 of input */
4838 const Py_UNICODE *startp = p;
4839 const Py_UNICODE *endp = p + size;
4840 /* pointer into the output */
4841 Py_UNICODE *str;
4842 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004843 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844 char *reason = "character maps to <undefined>";
4845 PyObject *errorHandler = NULL;
4846 PyObject *exc = NULL;
4847 /* the following variable is used for caching string comparisons
4848 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4849 * 3=ignore, 4=xmlcharrefreplace */
4850 int known_errorHandler = -1;
4851
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 if (mapping == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004853 PyErr_BadArgument();
4854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856
4857 /* allocate enough for a simple 1:1 translation without
4858 replacements, if we need more, we'll resize */
4859 res = PyUnicode_FromUnicode(NULL, size);
4860 if (res == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004861 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004863 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 while (p<endp) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004867 /* try to encode it */
4868 PyObject *x = NULL;
4869 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4870 Py_XDECREF(x);
4871 goto onError;
4872 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004873 Py_XDECREF(x);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004874 if (x!=Py_None) /* it worked => adjust input pointer */
4875 ++p;
4876 else { /* untranslatable character */
4877 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4878 Py_ssize_t repsize;
4879 Py_ssize_t newpos;
4880 Py_UNICODE *uni2;
4881 /* startpos for collecting untranslatable chars */
4882 const Py_UNICODE *collstart = p;
4883 const Py_UNICODE *collend = p+1;
4884 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004886 /* find all untranslatable characters */
4887 while (collend < endp) {
4888 if (charmaptranslate_lookup(*collend, mapping, &x))
4889 goto onError;
4890 Py_XDECREF(x);
4891 if (x!=Py_None)
4892 break;
4893 ++collend;
4894 }
4895 /* cache callback name lookup
4896 * (if not done yet, i.e. it's the first error) */
4897 if (known_errorHandler==-1) {
4898 if ((errors==NULL) || (!strcmp(errors, "strict")))
4899 known_errorHandler = 1;
4900 else if (!strcmp(errors, "replace"))
4901 known_errorHandler = 2;
4902 else if (!strcmp(errors, "ignore"))
4903 known_errorHandler = 3;
4904 else if (!strcmp(errors, "xmlcharrefreplace"))
4905 known_errorHandler = 4;
4906 else
4907 known_errorHandler = 0;
4908 }
4909 switch (known_errorHandler) {
4910 case 1: /* strict */
4911 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004912 goto onError;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004913 case 2: /* replace */
4914 /* No need to check for space, this is a 1:1 replacement */
4915 for (coll = collstart; coll<collend; ++coll)
4916 *str++ = '?';
4917 /* fall through */
4918 case 3: /* ignore */
4919 p = collend;
4920 break;
4921 case 4: /* xmlcharrefreplace */
4922 /* generate replacement (temporarily (mis)uses p) */
4923 for (p = collstart; p < collend; ++p) {
4924 char buffer[2+29+1+1];
4925 char *cp;
4926 sprintf(buffer, "&#%d;", (int)*p);
4927 if (charmaptranslate_makespace(&res, &str,
4928 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4929 goto onError;
4930 for (cp = buffer; *cp; ++cp)
4931 *str++ = *cp;
4932 }
4933 p = collend;
4934 break;
4935 default:
4936 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4937 reason, startp, size, &exc,
4938 collstart-startp, collend-startp, &newpos);
4939 if (repunicode == NULL)
4940 goto onError;
4941 /* generate replacement */
4942 repsize = PyUnicode_GET_SIZE(repunicode);
4943 if (charmaptranslate_makespace(&res, &str,
4944 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4945 Py_DECREF(repunicode);
4946 goto onError;
4947 }
4948 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4949 *str++ = *uni2;
4950 p = startp + newpos;
4951 Py_DECREF(repunicode);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004952 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004953 }
4954 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 /* Resize if we allocated to much */
4956 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004957 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004958 if (PyUnicode_Resize(&res, respos) < 0)
4959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004960 }
4961 Py_XDECREF(exc);
4962 Py_XDECREF(errorHandler);
4963 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004965 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 Py_XDECREF(res);
4967 Py_XDECREF(exc);
4968 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969 return NULL;
4970}
4971
4972PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004973 PyObject *mapping,
4974 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975{
4976 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004977
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 str = PyUnicode_FromObject(str);
4979 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004980 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004982 PyUnicode_GET_SIZE(str),
4983 mapping,
4984 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 Py_DECREF(str);
4986 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004987
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004988 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 Py_XDECREF(str);
4990 return NULL;
4991}
Tim Petersced69f82003-09-16 20:30:58 +00004992
Guido van Rossum9e896b32000-04-05 20:11:21 +00004993/* --- Decimal Encoder ---------------------------------------------------- */
4994
4995int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004996 Py_ssize_t length,
4997 char *output,
4998 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004999{
5000 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005001 PyObject *errorHandler = NULL;
5002 PyObject *exc = NULL;
5003 const char *encoding = "decimal";
5004 const char *reason = "invalid decimal Unicode string";
5005 /* the following variable is used for caching string comparisons
5006 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5007 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005008
5009 if (output == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005010 PyErr_BadArgument();
5011 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005012 }
5013
5014 p = s;
5015 end = s + length;
5016 while (p < end) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005017 register Py_UNICODE ch = *p;
5018 int decimal;
5019 PyObject *repunicode;
5020 Py_ssize_t repsize;
5021 Py_ssize_t newpos;
5022 Py_UNICODE *uni2;
5023 Py_UNICODE *collstart;
5024 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005025
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005026 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005027 *output++ = ' ';
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005028 ++p;
5029 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005030 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005031 decimal = Py_UNICODE_TODECIMAL(ch);
5032 if (decimal >= 0) {
5033 *output++ = '0' + decimal;
5034 ++p;
5035 continue;
5036 }
5037 if (0 < ch && ch < 256) {
5038 *output++ = (char)ch;
5039 ++p;
5040 continue;
5041 }
5042 /* All other characters are considered unencodable */
5043 collstart = p;
5044 collend = p+1;
5045 while (collend < end) {
5046 if ((0 < *collend && *collend < 256) ||
5047 !Py_UNICODE_ISSPACE(*collend) ||
5048 Py_UNICODE_TODECIMAL(*collend))
5049 break;
5050 }
5051 /* cache callback name lookup
5052 * (if not done yet, i.e. it's the first error) */
5053 if (known_errorHandler==-1) {
5054 if ((errors==NULL) || (!strcmp(errors, "strict")))
5055 known_errorHandler = 1;
5056 else if (!strcmp(errors, "replace"))
5057 known_errorHandler = 2;
5058 else if (!strcmp(errors, "ignore"))
5059 known_errorHandler = 3;
5060 else if (!strcmp(errors, "xmlcharrefreplace"))
5061 known_errorHandler = 4;
5062 else
5063 known_errorHandler = 0;
5064 }
5065 switch (known_errorHandler) {
5066 case 1: /* strict */
5067 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5068 goto onError;
5069 case 2: /* replace */
5070 for (p = collstart; p < collend; ++p)
5071 *output++ = '?';
5072 /* fall through */
5073 case 3: /* ignore */
5074 p = collend;
5075 break;
5076 case 4: /* xmlcharrefreplace */
5077 /* generate replacement (temporarily (mis)uses p) */
5078 for (p = collstart; p < collend; ++p)
5079 output += sprintf(output, "&#%d;", (int)*p);
5080 p = collend;
5081 break;
5082 default:
5083 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5084 encoding, reason, s, length, &exc,
5085 collstart-s, collend-s, &newpos);
5086 if (repunicode == NULL)
5087 goto onError;
5088 /* generate replacement */
5089 repsize = PyUnicode_GET_SIZE(repunicode);
5090 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5091 Py_UNICODE ch = *uni2;
5092 if (Py_UNICODE_ISSPACE(ch))
5093 *output++ = ' ';
5094 else {
5095 decimal = Py_UNICODE_TODECIMAL(ch);
5096 if (decimal >= 0)
5097 *output++ = '0' + decimal;
5098 else if (0 < ch && ch < 256)
5099 *output++ = (char)ch;
5100 else {
5101 Py_DECREF(repunicode);
5102 raise_encode_exception(&exc, encoding,
5103 s, length, collstart-s, collend-s, reason);
5104 goto onError;
5105 }
5106 }
5107 }
5108 p = s + newpos;
5109 Py_DECREF(repunicode);
5110 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005111 }
5112 /* 0-terminate the output string */
5113 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 Py_XDECREF(exc);
5115 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005116 return 0;
5117
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005118 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005119 Py_XDECREF(exc);
5120 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005121 return -1;
5122}
5123
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124/* --- Helpers ------------------------------------------------------------ */
5125
Eric Smitha9f7d622008-02-17 19:46:49 +00005126#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005127
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005128#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005129
Fredrik Lundha50d2012006-05-26 17:04:58 +00005130#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005131
5132#include "stringlib/count.h"
5133#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005134#include "stringlib/partition.h"
5135
/* Helper macro: clamp the slice bounds held in the variables `start`
   and `end` of the expanding scope against (obj)->length.  Negative
   values are first offset by the length, then floored at 0; `end` is
   additionally capped at the length.  Expands to plain statements,
   so it is only safe where a statement sequence is allowed. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
    }                                           \
    if (end < 0)                                \
        end = 0;
5148
Martin v. Löwis18e16552006-02-15 17:27:45 +00005149Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005150 PyObject *substr,
5151 Py_ssize_t start,
5152 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005154 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005155 PyUnicodeObject* str_obj;
5156 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005157
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005158 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5159 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005160 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005161 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5162 if (!sub_obj) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005163 Py_DECREF(str_obj);
5164 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 }
Tim Petersced69f82003-09-16 20:30:58 +00005166
Fredrik Lundhc8162812006-05-26 19:33:03 +00005167 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005168
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005169 result = stringlib_count(
5170 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5171 );
5172
5173 Py_DECREF(sub_obj);
5174 Py_DECREF(str_obj);
5175
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 return result;
5177}
5178
Martin v. Löwis18e16552006-02-15 17:27:45 +00005179Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005180 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005181 Py_ssize_t start,
5182 Py_ssize_t end,
5183 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005185 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005186
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005187 str = PyUnicode_FromObject(str);
5188 if (!str)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005189 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005190 sub = PyUnicode_FromObject(sub);
5191 if (!sub) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005192 Py_DECREF(str);
5193 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 }
Tim Petersced69f82003-09-16 20:30:58 +00005195
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005196 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005197 result = stringlib_find_slice(
5198 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5199 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5200 start, end
5201 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005202 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005203 result = stringlib_rfind_slice(
5204 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5205 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5206 start, end
5207 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005208
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005209 Py_DECREF(str);
5210 Py_DECREF(sub);
5211
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 return result;
5213}
5214
Tim Petersced69f82003-09-16 20:30:58 +00005215static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216int tailmatch(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005217 PyUnicodeObject *substring,
5218 Py_ssize_t start,
5219 Py_ssize_t end,
5220 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 if (substring->length == 0)
5223 return 1;
5224
Fredrik Lundhc8162812006-05-26 19:33:03 +00005225 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226
5227 end -= substring->length;
5228 if (end < start)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005229 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
5231 if (direction > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005232 if (Py_UNICODE_MATCH(self, end, substring))
5233 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 } else {
5235 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005236 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 }
5238
5239 return 0;
5240}
5241
Martin v. Löwis18e16552006-02-15 17:27:45 +00005242Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005243 PyObject *substr,
5244 Py_ssize_t start,
5245 Py_ssize_t end,
5246 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005248 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005249
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 str = PyUnicode_FromObject(str);
5251 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005252 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 substr = PyUnicode_FromObject(substr);
5254 if (substr == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005255 Py_DECREF(str);
5256 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 }
Tim Petersced69f82003-09-16 20:30:58 +00005258
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005260 (PyUnicodeObject *)substr,
5261 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 Py_DECREF(str);
5263 Py_DECREF(substr);
5264 return result;
5265}
5266
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267/* Apply fixfct filter to the Unicode object self and return a
5268 reference to the modified object */
5269
Tim Petersced69f82003-09-16 20:30:58 +00005270static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005272 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273{
5274
5275 PyUnicodeObject *u;
5276
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005277 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005279 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005280
5281 Py_UNICODE_COPY(u->str, self->str, self->length);
5282
Tim Peters7a29bd52001-09-12 03:03:31 +00005283 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005284 /* fixfct should return TRUE if it modified the buffer. If
5285 FALSE, return a reference to the original buffer instead
5286 (to save space, not time) */
5287 Py_INCREF(self);
5288 Py_DECREF(u);
5289 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 }
5291 return (PyObject*) u;
5292}
5293
Tim Petersced69f82003-09-16 20:30:58 +00005294static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295int fixupper(PyUnicodeObject *self)
5296{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 Py_UNICODE *s = self->str;
5299 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 while (len-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005302 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005303
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005304 ch = Py_UNICODE_TOUPPER(*s);
5305 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 status = 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005307 *s = ch;
5308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 s++;
5310 }
5311
5312 return status;
5313}
5314
Tim Petersced69f82003-09-16 20:30:58 +00005315static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316int fixlower(PyUnicodeObject *self)
5317{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005318 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 Py_UNICODE *s = self->str;
5320 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005321
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 while (len-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005323 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005324
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005325 ch = Py_UNICODE_TOLOWER(*s);
5326 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 status = 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005328 *s = ch;
5329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 s++;
5331 }
5332
5333 return status;
5334}
5335
Tim Petersced69f82003-09-16 20:30:58 +00005336static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337int fixswapcase(PyUnicodeObject *self)
5338{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005339 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 Py_UNICODE *s = self->str;
5341 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005342
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 while (len-- > 0) {
5344 if (Py_UNICODE_ISUPPER(*s)) {
5345 *s = Py_UNICODE_TOLOWER(*s);
5346 status = 1;
5347 } else if (Py_UNICODE_ISLOWER(*s)) {
5348 *s = Py_UNICODE_TOUPPER(*s);
5349 status = 1;
5350 }
5351 s++;
5352 }
5353
5354 return status;
5355}
5356
Tim Petersced69f82003-09-16 20:30:58 +00005357static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358int fixcapitalize(PyUnicodeObject *self)
5359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005360 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005361 Py_UNICODE *s = self->str;
5362 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005363
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005364 if (len == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005365 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005366 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005367 *s = Py_UNICODE_TOUPPER(*s);
5368 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005370 s++;
5371 while (--len > 0) {
5372 if (Py_UNICODE_ISUPPER(*s)) {
5373 *s = Py_UNICODE_TOLOWER(*s);
5374 status = 1;
5375 }
5376 s++;
5377 }
5378 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379}
5380
5381static
5382int fixtitle(PyUnicodeObject *self)
5383{
5384 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5385 register Py_UNICODE *e;
5386 int previous_is_cased;
5387
5388 /* Shortcut for single character strings */
5389 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005390 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5391 if (*p != ch) {
5392 *p = ch;
5393 return 1;
5394 }
5395 else
5396 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 }
Tim Petersced69f82003-09-16 20:30:58 +00005398
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 e = p + PyUnicode_GET_SIZE(self);
5400 previous_is_cased = 0;
5401 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005402 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005403
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005404 if (previous_is_cased)
5405 *p = Py_UNICODE_TOLOWER(ch);
5406 else
5407 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005408
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005409 if (Py_UNICODE_ISLOWER(ch) ||
5410 Py_UNICODE_ISUPPER(ch) ||
5411 Py_UNICODE_ISTITLE(ch))
5412 previous_is_cased = 1;
5413 else
5414 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 }
5416 return 1;
5417}
5418
Tim Peters8ce9f162004-08-27 01:49:32 +00005419PyObject *
5420PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421{
Tim Peters8ce9f162004-08-27 01:49:32 +00005422 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005423 const Py_UNICODE blank = ' ';
5424 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005425 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005426 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005427 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5428 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005429 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5430 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005431 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005432 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005433 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434
Tim Peters05eba1f2004-08-27 21:32:02 +00005435 fseq = PySequence_Fast(seq, "");
5436 if (fseq == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005437 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005438 }
5439
Tim Peters91879ab2004-08-27 22:35:44 +00005440 /* Grrrr. A codec may be invoked to convert str objects to
5441 * Unicode, and so it's possible to call back into Python code
5442 * during PyUnicode_FromObject(), and so it's possible for a sick
5443 * codec to change the size of fseq (if seq is a list). Therefore
5444 * we have to keep refetching the size -- can't assume seqlen
5445 * is invariant.
5446 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005447 seqlen = PySequence_Fast_GET_SIZE(fseq);
5448 /* If empty sequence, return u"". */
5449 if (seqlen == 0) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005450 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5451 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005452 }
5453 /* If singleton sequence with an exact Unicode, return that. */
5454 if (seqlen == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005455 item = PySequence_Fast_GET_ITEM(fseq, 0);
5456 if (PyUnicode_CheckExact(item)) {
5457 Py_INCREF(item);
5458 res = (PyUnicodeObject *)item;
5459 goto Done;
5460 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005461 }
5462
Tim Peters05eba1f2004-08-27 21:32:02 +00005463 /* At least two items to join, or one that isn't exact Unicode. */
5464 if (seqlen > 1) {
5465 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005466 if (separator == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005467 sep = &blank;
5468 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005469 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005470 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005471 internal_separator = PyUnicode_FromObject(separator);
5472 if (internal_separator == NULL)
5473 goto onError;
5474 sep = PyUnicode_AS_UNICODE(internal_separator);
5475 seplen = PyUnicode_GET_SIZE(internal_separator);
5476 /* In case PyUnicode_FromObject() mutated seq. */
5477 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005478 }
5479 }
5480
5481 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005482 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005483 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005484 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005485 res_p = PyUnicode_AS_UNICODE(res);
5486 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005487
Tim Peters05eba1f2004-08-27 21:32:02 +00005488 for (i = 0; i < seqlen; ++i) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005489 Py_ssize_t itemlen;
5490 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005491
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005492 item = PySequence_Fast_GET_ITEM(fseq, i);
5493 /* Convert item to Unicode. */
5494 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5495 PyErr_Format(PyExc_TypeError,
5496 "sequence item %zd: expected string or Unicode,"
5497 " %.80s found",
5498 i, Py_TYPE(item)->tp_name);
5499 goto onError;
5500 }
5501 item = PyUnicode_FromObject(item);
5502 if (item == NULL)
5503 goto onError;
5504 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005505
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005506 /* In case PyUnicode_FromObject() mutated seq. */
5507 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005508
Tim Peters8ce9f162004-08-27 01:49:32 +00005509 /* Make sure we have enough space for the separator and the item. */
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005510 itemlen = PyUnicode_GET_SIZE(item);
5511 new_res_used = res_used + itemlen;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005512 if (new_res_used < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005513 goto Overflow;
5514 if (i < seqlen - 1) {
5515 new_res_used += seplen;
5516 if (new_res_used < 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005517 goto Overflow;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005518 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005519 if (new_res_used > res_alloc) {
5520 /* double allocated size until it's big enough */
5521 do {
5522 res_alloc += res_alloc;
5523 if (res_alloc <= 0)
5524 goto Overflow;
5525 } while (new_res_used > res_alloc);
5526 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5527 Py_DECREF(item);
5528 goto onError;
5529 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005530 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005531 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005532
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005533 /* Copy item, and maybe the separator. */
5534 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5535 res_p += itemlen;
5536 if (i < seqlen - 1) {
5537 Py_UNICODE_COPY(res_p, sep, seplen);
5538 res_p += seplen;
5539 }
5540 Py_DECREF(item);
5541 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005542 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005543
Tim Peters05eba1f2004-08-27 21:32:02 +00005544 /* Shrink res to match the used area; this probably can't fail,
5545 * but it's cheap to check.
5546 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005547 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005548 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005549
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005550 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005551 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005552 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 return (PyObject *)res;
5554
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005555 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005556 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005557 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005558 Py_DECREF(item);
5559 /* fall through */
5560
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005561 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005562 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005563 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005564 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 return NULL;
5566}
5567
Tim Petersced69f82003-09-16 20:30:58 +00005568static
5569PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005570 Py_ssize_t left,
5571 Py_ssize_t right,
5572 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573{
5574 PyUnicodeObject *u;
5575
5576 if (left < 0)
5577 left = 0;
5578 if (right < 0)
5579 right = 0;
5580
Tim Peters7a29bd52001-09-12 03:03:31 +00005581 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 Py_INCREF(self);
5583 return self;
5584 }
5585
Neal Norwitze7d8be82008-07-31 17:17:14 +00005586 if (left > PY_SSIZE_T_MAX - self->length ||
5587 right > PY_SSIZE_T_MAX - (left + self->length)) {
5588 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5589 return NULL;
5590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 u = _PyUnicode_New(left + self->length + right);
5592 if (u) {
5593 if (left)
5594 Py_UNICODE_FILL(u->str, fill, left);
5595 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5596 if (right)
5597 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5598 }
5599
5600 return u;
5601}
5602
/* Append the slice data[left:right] to `list` as a new Unicode object.
 *
 * Relies on names provided by the calling function: a `PyObject *str`
 * scratch variable, the target `list`, and an `onError` label that is
 * jumped to if the slice cannot be created or appended.  The temporary
 * reference held in `str` is released on both outcomes of the append.
 *
 * Wrapped in do { } while (0) so that the expansion is one statement and
 * cannot capture a following `else`.  Invoke with a trailing semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);                                             \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
5614static
5615PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005616 PyObject *list,
5617 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005619 register Py_ssize_t i;
5620 register Py_ssize_t j;
5621 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005623 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624
5625 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005626 /* find a token */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005627 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005628 i++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005629 j = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005630 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5631 i++;
5632 if (j < i) {
5633 if (maxcount-- <= 0)
5634 break;
5635 SPLIT_APPEND(buf, j, i);
5636 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5637 i++;
5638 j = i;
5639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 }
5641 if (j < len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005642 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 }
5644 return list;
5645
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005646 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 Py_DECREF(list);
5648 return NULL;
5649}
5650
5651PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005652 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005654 register Py_ssize_t i;
5655 register Py_ssize_t j;
5656 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 PyObject *list;
5658 PyObject *str;
5659 Py_UNICODE *data;
5660
5661 string = PyUnicode_FromObject(string);
5662 if (string == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 data = PyUnicode_AS_UNICODE(string);
5665 len = PyUnicode_GET_SIZE(string);
5666
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 list = PyList_New(0);
5668 if (!list)
5669 goto onError;
5670
5671 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005672 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005673
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005674 /* Find a line and append it */
5675 while (i < len && !BLOOM_LINEBREAK(data[i]))
5676 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005678 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005679 eol = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005680 if (i < len) {
5681 if (data[i] == '\r' && i + 1 < len &&
5682 data[i+1] == '\n')
5683 i += 2;
5684 else
5685 i++;
5686 if (keepends)
5687 eol = i;
5688 }
5689 SPLIT_APPEND(data, j, eol);
5690 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 }
5692 if (j < len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005693 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 }
5695
5696 Py_DECREF(string);
5697 return list;
5698
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005699 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005700 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 Py_DECREF(string);
5702 return NULL;
5703}
5704
Tim Petersced69f82003-09-16 20:30:58 +00005705static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005707 PyObject *list,
5708 Py_UNICODE ch,
5709 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005711 register Py_ssize_t i;
5712 register Py_ssize_t j;
5713 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005715 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716
5717 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005718 if (buf[i] == ch) {
5719 if (maxcount-- <= 0)
5720 break;
5721 SPLIT_APPEND(buf, j, i);
5722 i = j = i + 1;
5723 } else
5724 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 }
5726 if (j <= len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005727 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 }
5729 return list;
5730
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005731 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 Py_DECREF(list);
5733 return NULL;
5734}
5735
Tim Petersced69f82003-09-16 20:30:58 +00005736static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005738 PyObject *list,
5739 PyUnicodeObject *substring,
5740 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005742 register Py_ssize_t i;
5743 register Py_ssize_t j;
5744 Py_ssize_t len = self->length;
5745 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 PyObject *str;
5747
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005748 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005749 if (Py_UNICODE_MATCH(self, i, substring)) {
5750 if (maxcount-- <= 0)
5751 break;
5752 SPLIT_APPEND(self->str, j, i);
5753 i = j = i + sublen;
5754 } else
5755 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 }
5757 if (j <= len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005758 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 }
5760 return list;
5761
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005762 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 Py_DECREF(list);
5764 return NULL;
5765}
5766
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767static
5768PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005769 PyObject *list,
5770 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005772 register Py_ssize_t i;
5773 register Py_ssize_t j;
5774 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005776 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005777
5778 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005779 /* find a token */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005780 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005781 i--;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005782 j = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005783 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5784 i--;
5785 if (j > i) {
5786 if (maxcount-- <= 0)
5787 break;
5788 SPLIT_APPEND(buf, i + 1, j + 1);
5789 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5790 i--;
5791 j = i;
5792 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005793 }
5794 if (j >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005795 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005796 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005797 if (PyList_Reverse(list) < 0)
5798 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005799 return list;
5800
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005801 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005802 Py_DECREF(list);
5803 return NULL;
5804}
5805
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005806static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005807PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005808 PyObject *list,
5809 Py_UNICODE ch,
5810 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005811{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005812 register Py_ssize_t i;
5813 register Py_ssize_t j;
5814 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005815 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005816 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005817
5818 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005819 if (buf[i] == ch) {
5820 if (maxcount-- <= 0)
5821 break;
5822 SPLIT_APPEND(buf, i + 1, j + 1);
5823 j = i = i - 1;
5824 } else
5825 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005826 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005827 if (j >= -1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005828 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005829 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005830 if (PyList_Reverse(list) < 0)
5831 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005832 return list;
5833
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005834 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005835 Py_DECREF(list);
5836 return NULL;
5837}
5838
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005839static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005840PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005841 PyObject *list,
5842 PyUnicodeObject *substring,
5843 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005844{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005845 register Py_ssize_t i;
5846 register Py_ssize_t j;
5847 Py_ssize_t len = self->length;
5848 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005849 PyObject *str;
5850
5851 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005852 if (Py_UNICODE_MATCH(self, i, substring)) {
5853 if (maxcount-- <= 0)
5854 break;
5855 SPLIT_APPEND(self->str, i + sublen, j);
5856 j = i;
5857 i -= sublen;
5858 } else
5859 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005860 }
5861 if (j >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005862 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005863 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005864 if (PyList_Reverse(list) < 0)
5865 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005866 return list;
5867
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005868 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005869 Py_DECREF(list);
5870 return NULL;
5871}
5872
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873#undef SPLIT_APPEND
5874
5875static
5876PyObject *split(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005877 PyUnicodeObject *substring,
5878 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879{
5880 PyObject *list;
5881
5882 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005883 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884
5885 list = PyList_New(0);
5886 if (!list)
5887 return NULL;
5888
5889 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005890 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
5892 else if (substring->length == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005893 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894
5895 else if (substring->length == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005896 Py_DECREF(list);
5897 PyErr_SetString(PyExc_ValueError, "empty separator");
5898 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 }
5900 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005901 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902}
5903
Tim Petersced69f82003-09-16 20:30:58 +00005904static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005905PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005906 PyUnicodeObject *substring,
5907 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005908{
5909 PyObject *list;
5910
5911 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005912 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005913
5914 list = PyList_New(0);
5915 if (!list)
5916 return NULL;
5917
5918 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005919 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005920
5921 else if (substring->length == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005922 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005923
5924 else if (substring->length == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005925 Py_DECREF(list);
5926 PyErr_SetString(PyExc_ValueError, "empty separator");
5927 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005928 }
5929 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005930 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005931}
5932
5933static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005935 PyUnicodeObject *str1,
5936 PyUnicodeObject *str2,
5937 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
5939 PyUnicodeObject *u;
5940
5941 if (maxcount < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005942 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
Fredrik Lundh347ee272006-05-24 16:35:18 +00005944 if (str1->length == str2->length) {
5945 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005946 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005947 if (str1->length == 1) {
5948 /* replace characters */
5949 Py_UNICODE u1, u2;
5950 if (!findchar(self->str, self->length, str1->str[0]))
5951 goto nothing;
5952 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5953 if (!u)
5954 return NULL;
5955 Py_UNICODE_COPY(u->str, self->str, self->length);
5956 u1 = str1->str[0];
5957 u2 = str2->str[0];
5958 for (i = 0; i < u->length; i++)
5959 if (u->str[i] == u1) {
5960 if (--maxcount < 0)
5961 break;
5962 u->str[i] = u2;
5963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005965 i = fastsearch(
5966 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005968 if (i < 0)
5969 goto nothing;
5970 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5971 if (!u)
5972 return NULL;
5973 Py_UNICODE_COPY(u->str, self->str, self->length);
5974 while (i <= self->length - str1->length)
5975 if (Py_UNICODE_MATCH(self, i, str1)) {
5976 if (--maxcount < 0)
5977 break;
5978 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5979 i += str1->length;
5980 } else
5981 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005984
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005985 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005986 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 Py_UNICODE *p;
5988
5989 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005990 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 if (n > maxcount)
5992 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005993 if (n == 0)
5994 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005995 /* new_size = self->length + n * (str2->length - str1->length)); */
5996 delta = (str2->length - str1->length);
5997 if (delta == 0) {
5998 new_size = self->length;
5999 } else {
6000 product = n * (str2->length - str1->length);
6001 if ((product / (str2->length - str1->length)) != n) {
6002 PyErr_SetString(PyExc_OverflowError,
6003 "replace string is too long");
6004 return NULL;
6005 }
6006 new_size = self->length + product;
6007 if (new_size < 0) {
6008 PyErr_SetString(PyExc_OverflowError,
6009 "replace string is too long");
6010 return NULL;
6011 }
6012 }
6013 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006014 if (!u)
6015 return NULL;
6016 i = 0;
6017 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006018 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006019 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006020 while (n-- > 0) {
6021 /* look for next match */
6022 j = i;
6023 while (j <= e) {
6024 if (Py_UNICODE_MATCH(self, j, str1))
6025 break;
6026 j++;
6027 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006028 if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006029 if (j > e)
6030 break;
6031 /* copy unchanged part [i:j] */
6032 Py_UNICODE_COPY(p, self->str+i, j-i);
6033 p += j - i;
6034 }
6035 /* copy substitution string */
6036 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006037 Py_UNICODE_COPY(p, str2->str, str2->length);
6038 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006039 }
6040 i = j + str1->length;
6041 }
6042 if (i < self->length)
6043 /* copy tail [i:] */
6044 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006045 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006046 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006047 while (n > 0) {
6048 Py_UNICODE_COPY(p, str2->str, str2->length);
6049 p += str2->length;
6050 if (--n <= 0)
6051 break;
6052 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006054 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 }
6056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006058
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006059 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006060 /* nothing to replace; return original string (when possible) */
6061 if (PyUnicode_CheckExact(self)) {
6062 Py_INCREF(self);
6063 return (PyObject *) self;
6064 }
6065 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066}
6067
6068/* --- Unicode Object Methods --------------------------------------------- */
6069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006070PyDoc_STRVAR(title__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006071 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072\n\
6073Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006074characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
6076static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006077unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 return fixup(self, fixtitle);
6080}
6081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006082PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006083 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084\n\
6085Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006086have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087
6088static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006089unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 return fixup(self, fixcapitalize);
6092}
6093
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).  Splits self on
   whitespace, replaces every word in the list by its fixcapitalize version
   and joins the list again with PyUnicode_Join(NULL, ...).  Returns NULL if
   any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *plain = PyList_GET_ITEM(words, idx);
        PyObject *fixed = fixup((PyUnicodeObject *)plain, fixcapitalize);

        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the list's reference to the old word before the slot is
           overwritten with the new one. */
        Py_DECREF(plain);
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6131
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006132/* Argument converter. Coerces to a single unicode character */
6133
6134static int
6135convert_uc(PyObject *obj, void *addr)
6136{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006137 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6138 PyObject *uniobj;
6139 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006140
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006141 uniobj = PyUnicode_FromObject(obj);
6142 if (uniobj == NULL) {
6143 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006144 "The fill character cannot be converted to Unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006145 return 0;
6146 }
6147 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6148 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006149 "The fill character must be exactly one character long");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006150 Py_DECREF(uniobj);
6151 return 0;
6152 }
6153 unistr = PyUnicode_AS_UNICODE(uniobj);
6154 *fillcharloc = unistr[0];
6155 Py_DECREF(uniobj);
6156 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006157}
6158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006159PyDoc_STRVAR(center__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006160 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006162Return S centered in a Unicode string of length width. Padding is\n\
6163done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164
6165static PyObject *
6166unicode_center(PyUnicodeObject *self, PyObject *args)
6167{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006168 Py_ssize_t marg, left;
6169 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006170 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
Thomas Woutersde017742006-02-16 19:34:37 +00006172 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 return NULL;
6174
Tim Peters7a29bd52001-09-12 03:03:31 +00006175 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 Py_INCREF(self);
6177 return (PyObject*) self;
6178 }
6179
6180 marg = width - self->length;
6181 left = marg / 2 + (marg & width & 1);
6182
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006183 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Marc-André Lemburge5034372000-08-08 08:04:29 +00006186#if 0
6187
6188/* This code should go into some future Unicode collation support
6189 module. The basic comparison should compare ordinals on a naive
Georg Brandla3c242c2009-10-27 14:19:50 +00006190 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006191
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006192/* speedy UTF-16 code point order comparison */
6193/* gleaned from: */
6194/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6195
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006196static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006197{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006198 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006199 0, 0, 0, 0, 0, 0, 0, 0,
6200 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006201 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006202};
6203
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204static int
6205unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6206{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006207 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006208
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 Py_UNICODE *s1 = str1->str;
6210 Py_UNICODE *s2 = str2->str;
6211
6212 len1 = str1->length;
6213 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006214
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006216 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006217
6218 c1 = *s1++;
6219 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006220
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006221 if (c1 > (1<<11) * 26)
6222 c1 += utf16Fixup[c1>>11];
6223 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006224 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006225 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006226
6227 if (c1 != c2)
6228 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006229
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006230 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 }
6232
6233 return (len1 < len2) ? -1 : (len1 != len2);
6234}
6235
Marc-André Lemburge5034372000-08-08 08:04:29 +00006236#else
6237
6238static int
6239unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6240{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006241 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006242
6243 Py_UNICODE *s1 = str1->str;
6244 Py_UNICODE *s2 = str2->str;
6245
6246 len1 = str1->length;
6247 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006248
Marc-André Lemburge5034372000-08-08 08:04:29 +00006249 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006250 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006251
Fredrik Lundh45714e92001-06-26 16:39:36 +00006252 c1 = *s1++;
6253 c2 = *s2++;
6254
6255 if (c1 != c2)
6256 return (c1 < c2) ? -1 : 1;
6257
Marc-André Lemburge5034372000-08-08 08:04:29 +00006258 len1--; len2--;
6259 }
6260
6261 return (len1 < len2) ? -1 : (len1 != len2);
6262}
6263
6264#endif
6265
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266int PyUnicode_Compare(PyObject *left,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006267 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268{
6269 PyUnicodeObject *u = NULL, *v = NULL;
6270 int result;
6271
6272 /* Coerce the two arguments */
6273 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6274 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006275 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6277 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006278 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279
Thomas Wouters7e474022000-07-16 12:04:32 +00006280 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 if (v == u) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006282 Py_DECREF(u);
6283 Py_DECREF(v);
6284 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 }
6286
6287 result = unicode_compare(u, v);
6288
6289 Py_DECREF(u);
6290 Py_DECREF(v);
6291 return result;
6292
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006293 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 Py_XDECREF(u);
6295 Py_XDECREF(v);
6296 return -1;
6297}
6298
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006299PyObject *PyUnicode_RichCompare(PyObject *left,
6300 PyObject *right,
6301 int op)
6302{
6303 int result;
6304
6305 result = PyUnicode_Compare(left, right);
6306 if (result == -1 && PyErr_Occurred())
6307 goto onError;
6308
6309 /* Convert the return value to a Boolean */
6310 switch (op) {
6311 case Py_EQ:
6312 result = (result == 0);
6313 break;
6314 case Py_NE:
6315 result = (result != 0);
6316 break;
6317 case Py_LE:
6318 result = (result <= 0);
6319 break;
6320 case Py_GE:
6321 result = (result >= 0);
6322 break;
6323 case Py_LT:
6324 result = (result == -1);
6325 break;
6326 case Py_GT:
6327 result = (result == 1);
6328 break;
6329 }
6330 return PyBool_FromLong(result);
6331
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006332 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006333
6334 /* Standard case
6335
6336 Type errors mean that PyUnicode_FromObject() could not convert
6337 one of the arguments (usually the right hand side) to Unicode,
6338 ie. we can't handle the comparison request. However, it is
6339 possible that the other object knows a comparison method, which
6340 is why we return Py_NotImplemented to give the other object a
6341 chance.
6342
6343 */
6344 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6345 PyErr_Clear();
6346 Py_INCREF(Py_NotImplemented);
6347 return Py_NotImplemented;
6348 }
6349 if (op != Py_EQ && op != Py_NE)
6350 return NULL;
6351
6352 /* Equality comparison.
6353
6354 This is a special case: we silence any PyExc_UnicodeDecodeError
6355 and instead turn it into a PyErr_UnicodeWarning.
6356
6357 */
6358 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6359 return NULL;
6360 PyErr_Clear();
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006361 if (PyErr_Warn(PyExc_UnicodeWarning,
6362 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006363 "Unicode equal comparison "
6364 "failed to convert both arguments to Unicode - "
6365 "interpreting them as being unequal" :
6366 "Unicode unequal comparison "
6367 "failed to convert both arguments to Unicode - "
6368 "interpreting them as being unequal"
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006369 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006370 return NULL;
6371 result = (op == Py_NE);
6372 return PyBool_FromLong(result);
6373}
6374
Guido van Rossum403d68b2000-03-13 15:55:09 +00006375int PyUnicode_Contains(PyObject *container,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006376 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006377{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006378 PyObject *str, *sub;
6379 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006380
6381 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006382 sub = PyUnicode_FromObject(element);
6383 if (!sub) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006384 PyErr_SetString(PyExc_TypeError,
6385 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006386 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006387 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006388
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006389 str = PyUnicode_FromObject(container);
6390 if (!str) {
6391 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006392 return -1;
6393 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006394
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006395 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006396
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006397 Py_DECREF(str);
6398 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006399
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006400 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006401}
6402
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403/* Concat to string or Unicode object giving a new Unicode object. */
6404
6405PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006406 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
6408 PyUnicodeObject *u = NULL, *v = NULL, *w;
6409
6410 /* Coerce the two arguments */
6411 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6412 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006413 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6415 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006416 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417
6418 /* Shortcuts */
6419 if (v == unicode_empty) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006420 Py_DECREF(v);
6421 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 }
6423 if (u == unicode_empty) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006424 Py_DECREF(u);
6425 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 }
6427
6428 /* Concat the two Unicode strings */
6429 w = _PyUnicode_New(u->length + v->length);
6430 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006431 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 Py_UNICODE_COPY(w->str, u->str, u->length);
6433 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6434
6435 Py_DECREF(u);
6436 Py_DECREF(v);
6437 return (PyObject *)w;
6438
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006439 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 Py_XDECREF(u);
6441 Py_XDECREF(v);
6442 return NULL;
6443}
6444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006445PyDoc_STRVAR(count__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006446 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006448Return the number of non-overlapping occurrences of substring sub in\n\
6449Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006450interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451
6452static PyObject *
6453unicode_count(PyUnicodeObject *self, PyObject *args)
6454{
6455 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006456 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006457 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 PyObject *result;
6459
Guido van Rossumb8872e62000-05-09 14:14:27 +00006460 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006461 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 return NULL;
6463
6464 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006465 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006467 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006468
Fredrik Lundhc8162812006-05-26 19:33:03 +00006469 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006471 result = PyInt_FromSsize_t(
6472 stringlib_count(self->str + start, end - start,
6473 substring->str, substring->length)
6474 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475
6476 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006477
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 return result;
6479}
6480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006481PyDoc_STRVAR(encode__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006482 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006484Encodes S using the codec registered for encoding. encoding defaults\n\
6485to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006486handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6488'xmlcharrefreplace' as well as any other name registered with\n\
6489codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490
6491static PyObject *
6492unicode_encode(PyUnicodeObject *self, PyObject *args)
6493{
6494 char *encoding = NULL;
6495 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006496 PyObject *v;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006497
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6499 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006500 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006501 if (v == NULL)
6502 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006503 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006504 PyErr_Format(PyExc_TypeError,
6505 "encoder did not return a string/unicode object "
6506 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006507 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006508 Py_DECREF(v);
6509 return NULL;
6510 }
6511 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006512
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006513 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006514 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006515}
6516
6517PyDoc_STRVAR(decode__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006518 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006519\n\
6520Decodes S using the codec registered for encoding. encoding defaults\n\
6521to the default encoding. errors may be given to set a different error\n\
6522handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6523a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6524as well as any other name registerd with codecs.register_error that is\n\
6525able to handle UnicodeDecodeErrors.");
6526
6527static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006528unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006529{
6530 char *encoding = NULL;
6531 char *errors = NULL;
6532 PyObject *v;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006533
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006534 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6535 return NULL;
6536 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006537 if (v == NULL)
6538 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006539 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006540 PyErr_Format(PyExc_TypeError,
6541 "decoder did not return a string/unicode object "
6542 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006543 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006544 Py_DECREF(v);
6545 return NULL;
6546 }
6547 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006548
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006549 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551}
6552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006553PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006554 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555\n\
6556Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006557If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558
6559static PyObject*
6560unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6561{
6562 Py_UNICODE *e;
6563 Py_UNICODE *p;
6564 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006565 Py_UNICODE *qe;
6566 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 PyUnicodeObject *u;
6568 int tabsize = 8;
6569
6570 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006571 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
Thomas Wouters7e474022000-07-16 12:04:32 +00006573 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006574 i = 0; /* chars up to and including most recent \n or \r */
6575 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6576 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 for (p = self->str; p < e; p++)
6578 if (*p == '\t') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006579 if (tabsize > 0) {
6580 incr = tabsize - (j % tabsize); /* cannot overflow */
6581 if (j > PY_SSIZE_T_MAX - incr)
6582 goto overflow1;
6583 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006584 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006587 if (j > PY_SSIZE_T_MAX - 1)
6588 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 j++;
6590 if (*p == '\n' || *p == '\r') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006591 if (i > PY_SSIZE_T_MAX - j)
6592 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006594 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 }
6596 }
6597
Guido van Rossum5bdff602008-03-11 21:18:06 +00006598 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006599 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006600
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 /* Second pass: create output string and fill it */
6602 u = _PyUnicode_New(i + j);
6603 if (!u)
6604 return NULL;
6605
Guido van Rossum5bdff602008-03-11 21:18:06 +00006606 j = 0; /* same as in first pass */
6607 q = u->str; /* next output char */
6608 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
6610 for (p = self->str; p < e; p++)
6611 if (*p == '\t') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006612 if (tabsize > 0) {
6613 i = tabsize - (j % tabsize);
6614 j += i;
6615 while (i--) {
6616 if (q >= qe)
6617 goto overflow2;
6618 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006619 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006620 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006621 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006622 else {
6623 if (q >= qe)
6624 goto overflow2;
6625 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006626 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 if (*p == '\n' || *p == '\r')
6628 j = 0;
6629 }
6630
6631 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006632
6633 overflow2:
6634 Py_DECREF(u);
6635 overflow1:
6636 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6637 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638}
6639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006640PyDoc_STRVAR(find__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006641 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642\n\
6643Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006644such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645arguments start and end are interpreted as in slice notation.\n\
6646\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006647Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648
6649static PyObject *
6650unicode_find(PyUnicodeObject *self, PyObject *args)
6651{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006652 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006653 Py_ssize_t start;
6654 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006655 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656
Facundo Batista57d56692007-11-16 18:04:14 +00006657 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006660 result = stringlib_find_slice(
6661 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6662 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6663 start, end
6664 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665
6666 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006667
6668 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669}
6670
6671static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006672unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673{
6674 if (index < 0 || index >= self->length) {
6675 PyErr_SetString(PyExc_IndexError, "string index out of range");
6676 return NULL;
6677 }
6678
6679 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6680}
6681
6682static long
6683unicode_hash(PyUnicodeObject *self)
6684{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006685 /* Since Unicode objects compare equal to their ASCII string
6686 counterparts, they should use the individual character values
6687 as basis for their hash value. This is needed to assure that
6688 strings and Unicode objects behave in the same way as
6689 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
Martin v. Löwis18e16552006-02-15 17:27:45 +00006691 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006692 register Py_UNICODE *p;
6693 register long x;
6694
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 if (self->hash != -1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006696 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006697 len = PyUnicode_GET_SIZE(self);
6698 p = PyUnicode_AS_UNICODE(self);
6699 x = *p << 7;
6700 while (--len >= 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006701 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006702 x ^= PyUnicode_GET_SIZE(self);
6703 if (x == -1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006704 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006705 self->hash = x;
6706 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707}
6708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006709PyDoc_STRVAR(index__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006710 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
6714static PyObject *
6715unicode_index(PyUnicodeObject *self, PyObject *args)
6716{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006717 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006718 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006719 Py_ssize_t start;
6720 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721
Facundo Batista57d56692007-11-16 18:04:14 +00006722 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006725 result = stringlib_find_slice(
6726 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6727 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6728 start, end
6729 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730
6731 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006732
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 if (result < 0) {
6734 PyErr_SetString(PyExc_ValueError, "substring not found");
6735 return NULL;
6736 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006737
Martin v. Löwis18e16552006-02-15 17:27:45 +00006738 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739}
6740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006741PyDoc_STRVAR(islower__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006742 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006744Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006745at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
6747static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006748unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749{
6750 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6751 register const Py_UNICODE *e;
6752 int cased;
6753
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 /* Shortcut for single character strings */
6755 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006756 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006758 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006759 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006760 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006761
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 e = p + PyUnicode_GET_SIZE(self);
6763 cased = 0;
6764 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006765 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006766
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006767 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6768 return PyBool_FromLong(0);
6769 else if (!cased && Py_UNICODE_ISLOWER(ch))
6770 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006772 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773}
6774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006775PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006776 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006778Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006779at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780
6781static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006782unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783{
6784 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6785 register const Py_UNICODE *e;
6786 int cased;
6787
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 /* Shortcut for single character strings */
6789 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006790 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006792 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006793 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006794 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006795
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796 e = p + PyUnicode_GET_SIZE(self);
6797 cased = 0;
6798 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006799 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006800
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006801 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6802 return PyBool_FromLong(0);
6803 else if (!cased && Py_UNICODE_ISUPPER(ch))
6804 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006806 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807}
6808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006809PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006810 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006812Return True if S is a titlecased string and there is at least one\n\
6813character in S, i.e. upper- and titlecase characters may only\n\
6814follow uncased characters and lowercase characters only cased ones.\n\
6815Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816
6817static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006818unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819{
6820 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6821 register const Py_UNICODE *e;
6822 int cased, previous_is_cased;
6823
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 /* Shortcut for single character strings */
6825 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006826 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6827 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006829 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006830 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006831 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006832
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833 e = p + PyUnicode_GET_SIZE(self);
6834 cased = 0;
6835 previous_is_cased = 0;
6836 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006837 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006838
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006839 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6840 if (previous_is_cased)
6841 return PyBool_FromLong(0);
6842 previous_is_cased = 1;
6843 cased = 1;
6844 }
6845 else if (Py_UNICODE_ISLOWER(ch)) {
6846 if (!previous_is_cased)
6847 return PyBool_FromLong(0);
6848 previous_is_cased = 1;
6849 cased = 1;
6850 }
6851 else
6852 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855}
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006858 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006860Return True if all characters in S are whitespace\n\
6861and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006864unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865{
6866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6867 register const Py_UNICODE *e;
6868
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 /* Shortcut for single character strings */
6870 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006871 Py_UNICODE_ISSPACE(*p))
6872 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006874 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006875 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006876 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 e = p + PyUnicode_GET_SIZE(self);
6879 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006880 if (!Py_UNICODE_ISSPACE(*p))
6881 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884}
6885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006886PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006887 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006889Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006891
6892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006893unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006894{
6895 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6896 register const Py_UNICODE *e;
6897
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006898 /* Shortcut for single character strings */
6899 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006900 Py_UNICODE_ISALPHA(*p))
6901 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006902
6903 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006904 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006905 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006906
6907 e = p + PyUnicode_GET_SIZE(self);
6908 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006909 if (!Py_UNICODE_ISALPHA(*p))
6910 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006911 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006912 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006913}
6914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006915PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006916 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006917\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006918Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006919and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006920
6921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006922unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006923{
6924 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6925 register const Py_UNICODE *e;
6926
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006927 /* Shortcut for single character strings */
6928 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006929 Py_UNICODE_ISALNUM(*p))
6930 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006931
6932 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006933 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006934 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006935
6936 e = p + PyUnicode_GET_SIZE(self);
6937 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006938 if (!Py_UNICODE_ISALNUM(*p))
6939 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006940 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006941 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006942}
6943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006944PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006945 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006947Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006948False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006951unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952{
6953 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6954 register const Py_UNICODE *e;
6955
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 /* Shortcut for single character strings */
6957 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006958 Py_UNICODE_ISDECIMAL(*p))
6959 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006961 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006962 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006963 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006964
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 e = p + PyUnicode_GET_SIZE(self);
6966 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006967 if (!Py_UNICODE_ISDECIMAL(*p))
6968 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006970 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971}
6972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006973PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006974 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006976Return True if all characters in S are digits\n\
6977and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978
6979static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006980unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981{
6982 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6983 register const Py_UNICODE *e;
6984
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 /* Shortcut for single character strings */
6986 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006987 Py_UNICODE_ISDIGIT(*p))
6988 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006990 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006991 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006992 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006993
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 e = p + PyUnicode_GET_SIZE(self);
6995 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006996 if (!Py_UNICODE_ISDIGIT(*p))
6997 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006999 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000}
7001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007002PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007003 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007005Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007006False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007
7008static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007009unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010{
7011 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7012 register const Py_UNICODE *e;
7013
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 /* Shortcut for single character strings */
7015 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007016 Py_UNICODE_ISNUMERIC(*p))
7017 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007019 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007020 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007021 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007022
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 e = p + PyUnicode_GET_SIZE(self);
7024 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007025 if (!Py_UNICODE_ISNUMERIC(*p))
7026 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007028 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029}
7030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007031PyDoc_STRVAR(join__doc__,
Georg Brandl5d2eb342009-10-27 15:08:27 +00007032 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033\n\
7034Return a string which is the concatenation of the strings in the\n\
Georg Brandl5d2eb342009-10-27 15:08:27 +00007035iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036
7037static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007038unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007040 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041}
7042
Martin v. Löwis18e16552006-02-15 17:27:45 +00007043static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044unicode_length(PyUnicodeObject *self)
7045{
7046 return self->length;
7047}
7048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007049PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007050 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007052Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007053done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054
7055static PyObject *
7056unicode_ljust(PyUnicodeObject *self, PyObject *args)
7057{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007058 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007059 Py_UNICODE fillchar = ' ';
7060
Martin v. Löwis412fb672006-04-13 06:34:32 +00007061 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 return NULL;
7063
Tim Peters7a29bd52001-09-12 03:03:31 +00007064 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 Py_INCREF(self);
7066 return (PyObject*) self;
7067 }
7068
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007069 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070}
7071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007072PyDoc_STRVAR(lower__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007073 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007075Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076
7077static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007078unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 return fixup(self, fixlower);
7081}
7082
/* Strip modes.  Each value is also used as an index into stripformat[]. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple format strings, indexed by the strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7091
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007092/* externally visible for str.strip(unicode) */
7093PyObject *
7094_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7095{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007096 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7097 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7098 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7099 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7100 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007101
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007102 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007103
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007104 i = 0;
7105 if (striptype != RIGHTSTRIP) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007106 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7107 i++;
7108 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007109 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007110
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007111 j = len;
7112 if (striptype != LEFTSTRIP) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007113 do {
7114 j--;
7115 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7116 j++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007117 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007118
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007119 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007120 Py_INCREF(self);
7121 return (PyObject*)self;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007122 }
7123 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007124 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125}
7126
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127
7128static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007129do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007131 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7132 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007133
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007134 i = 0;
7135 if (striptype != RIGHTSTRIP) {
7136 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7137 i++;
7138 }
7139 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007140
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007141 j = len;
7142 if (striptype != LEFTSTRIP) {
7143 do {
7144 j--;
7145 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7146 j++;
7147 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007148
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007149 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7150 Py_INCREF(self);
7151 return (PyObject*)self;
7152 }
7153 else
7154 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155}
7156
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007157
7158static PyObject *
7159do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7160{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007161 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007162
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007163 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7164 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007165
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007166 if (sep != NULL && sep != Py_None) {
7167 if (PyUnicode_Check(sep))
7168 return _PyUnicode_XStrip(self, striptype, sep);
7169 else if (PyString_Check(sep)) {
7170 PyObject *res;
7171 sep = PyUnicode_FromObject(sep);
7172 if (sep==NULL)
7173 return NULL;
7174 res = _PyUnicode_XStrip(self, striptype, sep);
7175 Py_DECREF(sep);
7176 return res;
7177 }
7178 else {
7179 PyErr_Format(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007180 "%s arg must be None, unicode or str",
7181 STRIPNAME(striptype));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007182 return NULL;
7183 }
7184 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007185
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007186 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007187}
7188
7189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007190PyDoc_STRVAR(strip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007191 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007192\n\
7193Return a copy of the string S with leading and trailing\n\
7194whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007195If chars is given and not None, remove characters in chars instead.\n\
7196If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007197
7198static PyObject *
7199unicode_strip(PyUnicodeObject *self, PyObject *args)
7200{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007201 if (PyTuple_GET_SIZE(args) == 0)
7202 return do_strip(self, BOTHSTRIP); /* Common case */
7203 else
7204 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007205}
7206
7207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007208PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007209 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007210\n\
7211Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007212If chars is given and not None, remove characters in chars instead.\n\
7213If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007214
7215static PyObject *
7216unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7217{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007218 if (PyTuple_GET_SIZE(args) == 0)
7219 return do_strip(self, LEFTSTRIP); /* Common case */
7220 else
7221 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007222}
7223
7224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007225PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007226 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007227\n\
7228Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007229If chars is given and not None, remove characters in chars instead.\n\
7230If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007231
7232static PyObject *
7233unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7234{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007235 if (PyTuple_GET_SIZE(args) == 0)
7236 return do_strip(self, RIGHTSTRIP); /* Common case */
7237 else
7238 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007239}
7240
7241
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007243unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244{
7245 PyUnicodeObject *u;
7246 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007247 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007248 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249
7250 if (len < 0)
7251 len = 0;
7252
Tim Peters7a29bd52001-09-12 03:03:31 +00007253 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 /* no repeat, return original string */
7255 Py_INCREF(str);
7256 return (PyObject*) str;
7257 }
Tim Peters8f422462000-09-09 06:13:41 +00007258
7259 /* ensure # of chars needed doesn't overflow int and # of bytes
7260 * needed doesn't overflow size_t
7261 */
7262 nchars = len * str->length;
7263 if (len && nchars / len != str->length) {
7264 PyErr_SetString(PyExc_OverflowError,
7265 "repeated string is too long");
7266 return NULL;
7267 }
7268 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7269 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7270 PyErr_SetString(PyExc_OverflowError,
7271 "repeated string is too long");
7272 return NULL;
7273 }
7274 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 if (!u)
7276 return NULL;
7277
7278 p = u->str;
7279
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007280 if (str->length == 1 && len > 0) {
7281 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007282 } else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007283 Py_ssize_t done = 0; /* number of characters copied this far */
7284 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007285 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007286 done = str->length;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007287 }
7288 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007289 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007290 Py_UNICODE_COPY(p+done, p, n);
7291 done += n;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007292 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294
7295 return (PyObject*) u;
7296}
7297
7298PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007299 PyObject *subobj,
7300 PyObject *replobj,
7301 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302{
7303 PyObject *self;
7304 PyObject *str1;
7305 PyObject *str2;
7306 PyObject *result;
7307
7308 self = PyUnicode_FromObject(obj);
7309 if (self == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 str1 = PyUnicode_FromObject(subobj);
7312 if (str1 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007313 Py_DECREF(self);
7314 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315 }
7316 str2 = PyUnicode_FromObject(replobj);
7317 if (str2 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007318 Py_DECREF(self);
7319 Py_DECREF(str1);
7320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 }
Tim Petersced69f82003-09-16 20:30:58 +00007322 result = replace((PyUnicodeObject *)self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007323 (PyUnicodeObject *)str1,
7324 (PyUnicodeObject *)str2,
7325 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326 Py_DECREF(self);
7327 Py_DECREF(str1);
7328 Py_DECREF(str2);
7329 return result;
7330}
7331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007332PyDoc_STRVAR(replace__doc__,
Ezio Melotti6327bf12010-06-26 18:47:01 +00007333 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334\n\
7335Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007336old replaced by new. If the optional argument count is\n\
7337given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338
7339static PyObject*
7340unicode_replace(PyUnicodeObject *self, PyObject *args)
7341{
7342 PyUnicodeObject *str1;
7343 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007344 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 PyObject *result;
7346
Martin v. Löwis18e16552006-02-15 17:27:45 +00007347 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 return NULL;
7349 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7350 if (str1 == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007353 if (str2 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007354 Py_DECREF(str1);
7355 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357
7358 result = replace(self, str1, str2, maxcount);
7359
7360 Py_DECREF(str1);
7361 Py_DECREF(str2);
7362 return result;
7363}
7364
7365static
7366PyObject *unicode_repr(PyObject *unicode)
7367{
7368 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007369 PyUnicode_GET_SIZE(unicode),
7370 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371}
7372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007373PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007374 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375\n\
7376Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007377such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378arguments start and end are interpreted as in slice notation.\n\
7379\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007380Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381
7382static PyObject *
7383unicode_rfind(PyUnicodeObject *self, PyObject *args)
7384{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007385 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007386 Py_ssize_t start;
7387 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007388 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
Facundo Batista57d56692007-11-16 18:04:14 +00007390 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007393 result = stringlib_rfind_slice(
7394 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7395 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7396 start, end
7397 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398
7399 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007400
7401 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402}
7403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007404PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007405 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007407Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408
7409static PyObject *
7410unicode_rindex(PyUnicodeObject *self, PyObject *args)
7411{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007412 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007413 Py_ssize_t start;
7414 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007415 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416
Facundo Batista57d56692007-11-16 18:04:14 +00007417 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007418 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007420 result = stringlib_rfind_slice(
7421 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7422 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7423 start, end
7424 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425
7426 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007427
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428 if (result < 0) {
7429 PyErr_SetString(PyExc_ValueError, "substring not found");
7430 return NULL;
7431 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007432 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433}
7434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007435PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007436 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007438Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007439done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
7441static PyObject *
7442unicode_rjust(PyUnicodeObject *self, PyObject *args)
7443{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007444 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007445 Py_UNICODE fillchar = ' ';
7446
Martin v. Löwis412fb672006-04-13 06:34:32 +00007447 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 return NULL;
7449
Tim Peters7a29bd52001-09-12 03:03:31 +00007450 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 Py_INCREF(self);
7452 return (PyObject*) self;
7453 }
7454
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007455 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456}
7457
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007459unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460{
7461 /* standard clamping */
7462 if (start < 0)
7463 start = 0;
7464 if (end < 0)
7465 end = 0;
7466 if (end > self->length)
7467 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007468 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 /* full slice, return original string */
7470 Py_INCREF(self);
7471 return (PyObject*) self;
7472 }
7473 if (start > end)
7474 start = end;
7475 /* copy slice */
7476 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007477 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478}
7479
7480PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007481 PyObject *sep,
7482 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483{
7484 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007485
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 s = PyUnicode_FromObject(s);
7487 if (s == NULL)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007488 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007489 if (sep != NULL) {
7490 sep = PyUnicode_FromObject(sep);
7491 if (sep == NULL) {
7492 Py_DECREF(s);
7493 return NULL;
7494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 }
7496
7497 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7498
7499 Py_DECREF(s);
7500 Py_XDECREF(sep);
7501 return result;
7502}
7503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007504PyDoc_STRVAR(split__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007505 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506\n\
7507Return a list of the words in S, using sep as the\n\
7508delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007509splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007510whitespace string is a separator and empty strings are\n\
7511removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512
7513static PyObject*
7514unicode_split(PyUnicodeObject *self, PyObject *args)
7515{
7516 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007517 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518
Martin v. Löwis18e16552006-02-15 17:27:45 +00007519 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 return NULL;
7521
7522 if (substring == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007523 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 else if (PyUnicode_Check(substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007525 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007527 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528}
7529
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007530PyObject *
7531PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7532{
7533 PyObject* str_obj;
7534 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007535 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007536
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007537 str_obj = PyUnicode_FromObject(str_in);
7538 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007539 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007540 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007541 if (!sep_obj) {
7542 Py_DECREF(str_obj);
7543 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007544 }
7545
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007546 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007547 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7548 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7549 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007550
Fredrik Lundhb9479482006-05-26 17:22:38 +00007551 Py_DECREF(sep_obj);
7552 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007553
7554 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007555}
7556
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007557
7558PyObject *
7559PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7560{
7561 PyObject* str_obj;
7562 PyObject* sep_obj;
7563 PyObject* out;
7564
7565 str_obj = PyUnicode_FromObject(str_in);
7566 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007567 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007568 sep_obj = PyUnicode_FromObject(sep_in);
7569 if (!sep_obj) {
7570 Py_DECREF(str_obj);
7571 return NULL;
7572 }
7573
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007574 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007575 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7576 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7577 );
7578
7579 Py_DECREF(sep_obj);
7580 Py_DECREF(str_obj);
7581
7582 return out;
7583}
7584
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007585PyDoc_STRVAR(partition__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007586 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007587\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007588Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007589the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007590found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007591
7592static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007593unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007594{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007595 return PyUnicode_Partition((PyObject *)self, separator);
7596}
7597
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007598PyDoc_STRVAR(rpartition__doc__,
Ezio Melottidabb5f72010-01-25 11:46:11 +00007599 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007600\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007601Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007602the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007603separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007604
7605static PyObject*
7606unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7607{
7608 return PyUnicode_RPartition((PyObject *)self, separator);
7609}
7610
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007611PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007612 PyObject *sep,
7613 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007614{
7615 PyObject *result;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007616
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007617 s = PyUnicode_FromObject(s);
7618 if (s == NULL)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007619 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007620 if (sep != NULL) {
7621 sep = PyUnicode_FromObject(sep);
7622 if (sep == NULL) {
7623 Py_DECREF(s);
7624 return NULL;
7625 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007626 }
7627
7628 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7629
7630 Py_DECREF(s);
7631 Py_XDECREF(sep);
7632 return result;
7633}
7634
7635PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007636 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007637\n\
7638Return a list of the words in S, using sep as the\n\
7639delimiter string, starting at the end of the string and\n\
7640working to the front. If maxsplit is given, at most maxsplit\n\
7641splits are done. If sep is not specified, any whitespace string\n\
7642is a separator.");
7643
7644static PyObject*
7645unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7646{
7647 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007648 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007649
Martin v. Löwis18e16552006-02-15 17:27:45 +00007650 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007651 return NULL;
7652
7653 if (substring == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007654 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007655 else if (PyUnicode_Check(substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007656 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007657 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007658 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007659}
7660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007661PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007662 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663\n\
7664Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007665Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007666is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667
7668static PyObject*
7669unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7670{
Guido van Rossum86662912000-04-11 15:38:46 +00007671 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672
Guido van Rossum86662912000-04-11 15:38:46 +00007673 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 return NULL;
7675
Guido van Rossum86662912000-04-11 15:38:46 +00007676 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677}
7678
7679static
7680PyObject *unicode_str(PyUnicodeObject *self)
7681{
Fred Drakee4315f52000-05-09 19:53:39 +00007682 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683}
7684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007686 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687\n\
7688Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007689and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690
7691static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007692unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 return fixup(self, fixswapcase);
7695}
7696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007697PyDoc_STRVAR(translate__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007698 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699\n\
7700Return a copy of the string S, where all characters have been mapped\n\
7701through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007702Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7703Unmapped characters are left untouched. Characters mapped to None\n\
7704are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705
7706static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007707unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708{
Tim Petersced69f82003-09-16 20:30:58 +00007709 return PyUnicode_TranslateCharmap(self->str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007710 self->length,
7711 table,
7712 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713}
7714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007715PyDoc_STRVAR(upper__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007716 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007718Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
7720static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007721unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 return fixup(self, fixupper);
7724}
7725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007726PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007727 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728\n\
Georg Brandl98064072008-09-09 19:26:00 +00007729Pad a numeric string S with zeros on the left, to fill a field\n\
7730of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
7732static PyObject *
7733unicode_zfill(PyUnicodeObject *self, PyObject *args)
7734{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007735 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736 PyUnicodeObject *u;
7737
Martin v. Löwis18e16552006-02-15 17:27:45 +00007738 Py_ssize_t width;
7739 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 return NULL;
7741
7742 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007743 if (PyUnicode_CheckExact(self)) {
7744 Py_INCREF(self);
7745 return (PyObject*) self;
7746 }
7747 else
7748 return PyUnicode_FromUnicode(
7749 PyUnicode_AS_UNICODE(self),
7750 PyUnicode_GET_SIZE(self)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007751 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 }
7753
7754 fill = width - self->length;
7755
7756 u = pad(self, fill, 0, '0');
7757
Walter Dörwald068325e2002-04-15 13:36:47 +00007758 if (u == NULL)
7759 return NULL;
7760
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 if (u->str[fill] == '+' || u->str[fill] == '-') {
7762 /* move sign to beginning of string */
7763 u->str[0] = u->str[fill];
7764 u->str[fill] = '0';
7765 }
7766
7767 return (PyObject*) u;
7768}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769
#if 0
/* Compiled out: would return the file-level counter `numfree` as a
   Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(numfree);

    return count;
}
#endif
7777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007778PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007779 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007781Return True if S starts with the specified prefix, False otherwise.\n\
7782With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007783With optional end, stop comparing S at that position.\n\
7784prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785
7786static PyObject *
7787unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007788 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789{
Georg Brandl24250812006-06-09 18:45:48 +00007790 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007792 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007793 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007794 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795
Georg Brandl24250812006-06-09 18:45:48 +00007796 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007797 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7798 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007799 if (PyTuple_Check(subobj)) {
7800 Py_ssize_t i;
7801 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7802 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007803 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007804 if (substring == NULL)
7805 return NULL;
7806 result = tailmatch(self, substring, start, end, -1);
7807 Py_DECREF(substring);
7808 if (result) {
7809 Py_RETURN_TRUE;
7810 }
7811 }
7812 /* nothing matched */
7813 Py_RETURN_FALSE;
7814 }
7815 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007817 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007818 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007820 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821}
7822
7823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007824PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007825 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007827Return True if S ends with the specified suffix, False otherwise.\n\
7828With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007829With optional end, stop comparing S at that position.\n\
7830suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831
7832static PyObject *
7833unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007834 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835{
Georg Brandl24250812006-06-09 18:45:48 +00007836 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007838 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007839 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007840 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841
Georg Brandl24250812006-06-09 18:45:48 +00007842 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007843 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7844 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007845 if (PyTuple_Check(subobj)) {
7846 Py_ssize_t i;
7847 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7848 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007849 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007850 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007851 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007852 result = tailmatch(self, substring, start, end, +1);
7853 Py_DECREF(substring);
7854 if (result) {
7855 Py_RETURN_TRUE;
7856 }
7857 }
7858 Py_RETURN_FALSE;
7859 }
7860 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863
Georg Brandl24250812006-06-09 18:45:48 +00007864 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007866 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867}
7868
7869
Eric Smitha9f7d622008-02-17 19:46:49 +00007870/* Implements do_string_format, which is unicode because of stringlib */
7871#include "stringlib/string_format.h"
7872
7873PyDoc_STRVAR(format__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007874 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007875\n\
7876");
7877
Eric Smithdc13b792008-05-30 18:10:04 +00007878static PyObject *
7879unicode__format__(PyObject *self, PyObject *args)
7880{
7881 PyObject *format_spec;
7882 PyObject *result = NULL;
7883 PyObject *tmp = NULL;
7884
7885 /* If 2.x, convert format_spec to the same type as value */
7886 /* This is to allow things like u''.format('') */
7887 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7888 goto done;
7889 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7890 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007891 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007892 goto done;
7893 }
7894 tmp = PyObject_Unicode(format_spec);
7895 if (tmp == NULL)
7896 goto done;
7897 format_spec = tmp;
7898
7899 result = _PyUnicode_FormatAdvanced(self,
7900 PyUnicode_AS_UNICODE(format_spec),
7901 PyUnicode_GET_SIZE(format_spec));
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007902 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007903 Py_XDECREF(tmp);
7904 return result;
7905}
7906
Eric Smitha9f7d622008-02-17 19:46:49 +00007907PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007908 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007909\n\
7910");
7911
Robert Schuppenies901c9972008-06-10 10:10:31 +00007912static PyObject *
7913unicode__sizeof__(PyUnicodeObject *v)
7914{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007915 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7916 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007917}
7918
7919PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007920 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007921\n\
7922");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007923
7924static PyObject *
7925unicode_getnewargs(PyUnicodeObject *v)
7926{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007927 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007928}
7929
7930
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931static PyMethodDef unicode_methods[] = {
7932
7933 /* Order is according to common usage: often used methods should
7934 appear first, since lookup is done sequentially. */
7935
Georg Brandlecdc0a92006-03-30 12:19:07 +00007936 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007937 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7938 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007939 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007940 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7941 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7942 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7943 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7944 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7945 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7946 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007947 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007948 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7949 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7950 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007951 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007952 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007953/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7954 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7955 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7956 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007957 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007958 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007959 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007960 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007961 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7962 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7963 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7964 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7965 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7966 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7967 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7968 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7969 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7970 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7971 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7972 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7973 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7974 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007975 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007976 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7977 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7978 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7979 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007980 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007981#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007982 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983#endif
7984
7985#if 0
7986 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007987 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988#endif
7989
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007990 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 {NULL, NULL}
7992};
7993
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007994static PyObject *
7995unicode_mod(PyObject *v, PyObject *w)
7996{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007997 if (!PyUnicode_Check(v)) {
7998 Py_INCREF(Py_NotImplemented);
7999 return Py_NotImplemented;
8000 }
8001 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008002}
8003
8004static PyNumberMethods unicode_as_number = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008005 0, /*nb_add*/
8006 0, /*nb_subtract*/
8007 0, /*nb_multiply*/
8008 0, /*nb_divide*/
8009 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008010};
8011
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008013 (lenfunc) unicode_length, /* sq_length */
8014 PyUnicode_Concat, /* sq_concat */
8015 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8016 (ssizeargfunc) unicode_getitem, /* sq_item */
8017 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8018 0, /* sq_ass_item */
8019 0, /* sq_ass_slice */
8020 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021};
8022
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008023static PyObject*
8024unicode_subscript(PyUnicodeObject* self, PyObject* item)
8025{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008026 if (PyIndex_Check(item)) {
8027 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008028 if (i == -1 && PyErr_Occurred())
8029 return NULL;
8030 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008031 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008032 return unicode_getitem(self, i);
8033 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008034 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008035 Py_UNICODE* source_buf;
8036 Py_UNICODE* result_buf;
8037 PyObject* result;
8038
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008039 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008040 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008041 return NULL;
8042 }
8043
8044 if (slicelength <= 0) {
8045 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008046 } else if (start == 0 && step == 1 && slicelength == self->length &&
8047 PyUnicode_CheckExact(self)) {
8048 Py_INCREF(self);
8049 return (PyObject *)self;
8050 } else if (step == 1) {
8051 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008052 } else {
8053 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008054 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8055 sizeof(Py_UNICODE));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008056
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008057 if (result_buf == NULL)
8058 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008059
8060 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8061 result_buf[i] = source_buf[cur];
8062 }
Tim Petersced69f82003-09-16 20:30:58 +00008063
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008064 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008065 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008066 return result;
8067 }
8068 } else {
8069 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8070 return NULL;
8071 }
8072}
8073
8074static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008075 (lenfunc)unicode_length, /* mp_length */
8076 (binaryfunc)unicode_subscript, /* mp_subscript */
8077 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008078};
8079
Martin v. Löwis18e16552006-02-15 17:27:45 +00008080static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008082 Py_ssize_t index,
8083 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084{
8085 if (index != 0) {
8086 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008087 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 return -1;
8089 }
8090 *ptr = (void *) self->str;
8091 return PyUnicode_GET_DATA_SIZE(self);
8092}
8093
Martin v. Löwis18e16552006-02-15 17:27:45 +00008094static Py_ssize_t
8095unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008096 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097{
8098 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008099 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100 return -1;
8101}
8102
8103static int
8104unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008105 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106{
8107 if (lenp)
8108 *lenp = PyUnicode_GET_DATA_SIZE(self);
8109 return 1;
8110}
8111
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008112static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008114 Py_ssize_t index,
8115 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116{
8117 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008118
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 if (index != 0) {
8120 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008121 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 return -1;
8123 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008124 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008126 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008127 *ptr = (void *) PyString_AS_STRING(str);
8128 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129}
8130
8131/* Helpers for PyUnicode_Format() */
8132
8133static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008134getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008136 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 if (argidx < arglen) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008138 (*p_argidx)++;
8139 if (arglen < 0)
8140 return args;
8141 else
8142 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 }
8144 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008145 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 return NULL;
8147}
8148
/* Bit flags collected while parsing a format specifier; one bit each.
   F_ALT selects the '#' (alternate) form in formatfloat()/formatint().
   NOTE(review): the other names presumably map to the '-', '+', ' ' and
   '0' specifier flags -- confirm in PyUnicode_Format(). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154
Martin v. Löwis18e16552006-02-15 17:27:45 +00008155static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008156strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008158 register Py_ssize_t i;
8159 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 for (i = len - 1; i >= 0; i--)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008161 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163 return len;
8164}
8165
Neal Norwitzfc76d632006-01-10 06:03:13 +00008166static int
8167doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8168{
Tim Peters15231542006-02-16 01:08:01 +00008169 Py_ssize_t result;
8170
Neal Norwitzfc76d632006-01-10 06:03:13 +00008171 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008172 result = strtounicode(buffer, (char *)buffer);
8173 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008174}
8175
8176static int
8177longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8178{
Tim Peters15231542006-02-16 01:08:01 +00008179 Py_ssize_t result;
8180
Neal Norwitzfc76d632006-01-10 06:03:13 +00008181 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008182 result = strtounicode(buffer, (char *)buffer);
8183 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008184}
8185
Guido van Rossum078151d2002-08-11 04:24:12 +00008186/* XXX To save some code duplication, formatfloat/long/int could have been
8187 shared with stringobject.c, converting from 8-bit to Unicode after the
8188 formatting is done. */
8189
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190static int
8191formatfloat(Py_UNICODE *buf,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008192 size_t buflen,
8193 int flags,
8194 int prec,
8195 int type,
8196 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008198 /* fmt = '%#.' + `prec` + `type`
8199 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 char fmt[20];
8201 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008202
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 x = PyFloat_AsDouble(v);
8204 if (x == -1.0 && PyErr_Occurred())
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008205 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206 if (prec < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008207 prec = 6;
Mark Dickinson75be68b2009-08-28 20:57:42 +00008208#if SIZEOF_INT > 4
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008209 /* make sure that the decimal representation of precision really does
8210 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
Mark Dickinson75be68b2009-08-28 20:57:42 +00008211 if (prec > 0x7fffffff) {
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008212 PyErr_SetString(PyExc_OverflowError,
8213 "outrageously large precision "
8214 "for formatted float");
8215 return -1;
8216 }
Mark Dickinson75be68b2009-08-28 20:57:42 +00008217#endif
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008218
Mark Dickinsona30f3492009-03-29 15:06:29 +00008219 if (type == 'f' && fabs(x) >= 1e50)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008220 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008221 /* Worst case length calc to ensure no buffer overrun:
8222
8223 'g' formats:
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008224 fmt = %#.<prec>g
8225 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8226 for any double rep.)
8227 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008228
8229 'f' formats:
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008230 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8231 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008232
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008233 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008234 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008235
8236 */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008237 if (((type == 'g' || type == 'G') &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008238 buflen <= (size_t)10 + (size_t)prec) ||
8239 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8240 PyErr_SetString(PyExc_OverflowError,
8241 "formatted float is too long (precision too large?)");
8242 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008243 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008244 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008245 (flags&F_ALT) ? "#" : "",
8246 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008247 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248}
8249
Tim Peters38fd5b62000-09-21 05:43:11 +00008250static PyObject*
8251formatlong(PyObject *val, int flags, int prec, int type)
8252{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008253 char *buf;
8254 int i, len;
8255 PyObject *str; /* temporary string object. */
8256 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008257
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008258 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8259 if (!str)
8260 return NULL;
8261 result = _PyUnicode_New(len);
8262 if (!result) {
8263 Py_DECREF(str);
8264 return NULL;
8265 }
8266 for (i = 0; i < len; i++)
8267 result->str[i] = buf[i];
8268 result->str[len] = 0;
8269 Py_DECREF(str);
8270 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008271}
8272
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273static int
8274formatint(Py_UNICODE *buf,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008275 size_t buflen,
8276 int flags,
8277 int prec,
8278 int type,
8279 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008281 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008282 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8283 * + 1 + 1
8284 * = 24
8285 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008286 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008287 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 long x;
8289
8290 x = PyInt_AsLong(v);
8291 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008292 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008293 if (x < 0 && type == 'u') {
8294 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008295 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008296 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8297 sign = "-";
8298 else
8299 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008301 prec = 1;
8302
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008303 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8304 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008305 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008306 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008307 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008308 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008309 return -1;
8310 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008311
8312 if ((flags & F_ALT) &&
8313 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008314 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008315 * of issues that cause pain:
8316 * - when 0 is being converted, the C standard leaves off
8317 * the '0x' or '0X', which is inconsistent with other
8318 * %#x/%#X conversions and inconsistent with Python's
8319 * hex() function
8320 * - there are platforms that violate the standard and
8321 * convert 0 with the '0x' or '0X'
8322 * (Metrowerks, Compaq Tru64)
8323 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008324 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008325 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008326 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008327 * We can achieve the desired consistency by inserting our
8328 * own '0x' or '0X' prefix, and substituting %x/%X in place
8329 * of %#x/%#X.
8330 *
8331 * Note that this is the same approach as used in
8332 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008333 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008334 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8335 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008336 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008337 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008338 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8339 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008340 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008341 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008342 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008343 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008344 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008345 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346}
8347
8348static int
8349formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008350 size_t buflen,
8351 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352{
Ezio Melotti85ddea72010-02-25 17:51:33 +00008353 PyObject *unistr;
8354 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008355 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008356 if (PyUnicode_Check(v)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008357 if (PyUnicode_GET_SIZE(v) != 1)
8358 goto onError;
8359 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008362 else if (PyString_Check(v)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008363 if (PyString_GET_SIZE(v) != 1)
8364 goto onError;
Ezio Melotti85ddea72010-02-25 17:51:33 +00008365 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8366 with a UnicodeDecodeError if 'char' is not decodable with the
8367 default encoding (usually ASCII, but it might be something else) */
8368 str = PyString_AS_STRING(v);
8369 if ((unsigned char)str[0] > 0x7F) {
8370 /* the char is not ASCII; try to decode the string using the
8371 default encoding and return -1 to let the UnicodeDecodeError
8372 be raised if the string can't be decoded */
8373 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8374 if (unistr == NULL)
8375 return -1;
8376 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8377 Py_DECREF(unistr);
8378 }
8379 else
8380 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008381 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382
8383 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008384 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385 long x;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008386 x = PyInt_AsLong(v);
8387 if (x == -1 && PyErr_Occurred())
8388 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008389#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008390 if (x < 0 || x > 0x10ffff) {
8391 PyErr_SetString(PyExc_OverflowError,
8392 "%c arg not in range(0x110000) "
8393 "(wide Python build)");
8394 return -1;
8395 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008396#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008397 if (x < 0 || x > 0xffff) {
8398 PyErr_SetString(PyExc_OverflowError,
8399 "%c arg not in range(0x10000) "
8400 "(narrow Python build)");
8401 return -1;
8402 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008403#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008404 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 }
8406 buf[1] = '\0';
8407 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008408
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008409 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008410 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008411 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008412 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413}
8414
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8424
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008426 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427{
8428 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008429 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 int args_owned = 0;
8431 PyUnicodeObject *result = NULL;
8432 PyObject *dict = NULL;
8433 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008434
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 if (format == NULL || args == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008436 PyErr_BadInternalCall();
8437 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438 }
8439 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008440 if (uformat == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008441 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 fmt = PyUnicode_AS_UNICODE(uformat);
8443 fmtcnt = PyUnicode_GET_SIZE(uformat);
8444
8445 reslen = rescnt = fmtcnt + 100;
8446 result = _PyUnicode_New(reslen);
8447 if (result == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008448 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 res = PyUnicode_AS_UNICODE(result);
8450
8451 if (PyTuple_Check(args)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008452 arglen = PyTuple_Size(args);
8453 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454 }
8455 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008456 arglen = -1;
8457 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 }
Christian Heimese93237d2007-12-19 02:37:44 +00008459 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008460 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008461 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462
8463 while (--fmtcnt >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008464 if (*fmt != '%') {
8465 if (--rescnt < 0) {
8466 rescnt = fmtcnt + 100;
8467 reslen += rescnt;
8468 if (_PyUnicode_Resize(&result, reslen) < 0)
8469 goto onError;
8470 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8471 --rescnt;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008472 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008473 *res++ = *fmt++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008474 }
8475 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008476 /* Got a format specifier */
8477 int flags = 0;
8478 Py_ssize_t width = -1;
8479 int prec = -1;
8480 Py_UNICODE c = '\0';
8481 Py_UNICODE fill;
8482 int isnumok;
8483 PyObject *v = NULL;
8484 PyObject *temp = NULL;
8485 Py_UNICODE *pbuf;
8486 Py_UNICODE sign;
8487 Py_ssize_t len;
8488 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8489
8490 fmt++;
8491 if (*fmt == '(') {
8492 Py_UNICODE *keystart;
8493 Py_ssize_t keylen;
8494 PyObject *key;
8495 int pcount = 1;
8496
8497 if (dict == NULL) {
8498 PyErr_SetString(PyExc_TypeError,
8499 "format requires a mapping");
8500 goto onError;
8501 }
8502 ++fmt;
8503 --fmtcnt;
8504 keystart = fmt;
8505 /* Skip over balanced parentheses */
8506 while (pcount > 0 && --fmtcnt >= 0) {
8507 if (*fmt == ')')
8508 --pcount;
8509 else if (*fmt == '(')
8510 ++pcount;
8511 fmt++;
8512 }
8513 keylen = fmt - keystart - 1;
8514 if (fmtcnt < 0 || pcount > 0) {
8515 PyErr_SetString(PyExc_ValueError,
8516 "incomplete format key");
8517 goto onError;
8518 }
8519#if 0
8520 /* keys are converted to strings using UTF-8 and
8521 then looked up since Python uses strings to hold
8522 variables names etc. in its namespaces and we
8523 wouldn't want to break common idioms. */
8524 key = PyUnicode_EncodeUTF8(keystart,
8525 keylen,
8526 NULL);
8527#else
8528 key = PyUnicode_FromUnicode(keystart, keylen);
8529#endif
8530 if (key == NULL)
8531 goto onError;
8532 if (args_owned) {
8533 Py_DECREF(args);
8534 args_owned = 0;
8535 }
8536 args = PyObject_GetItem(dict, key);
8537 Py_DECREF(key);
8538 if (args == NULL) {
8539 goto onError;
8540 }
8541 args_owned = 1;
8542 arglen = -1;
8543 argidx = -2;
8544 }
8545 while (--fmtcnt >= 0) {
8546 switch (c = *fmt++) {
8547 case '-': flags |= F_LJUST; continue;
8548 case '+': flags |= F_SIGN; continue;
8549 case ' ': flags |= F_BLANK; continue;
8550 case '#': flags |= F_ALT; continue;
8551 case '0': flags |= F_ZERO; continue;
8552 }
8553 break;
8554 }
8555 if (c == '*') {
8556 v = getnextarg(args, arglen, &argidx);
8557 if (v == NULL)
8558 goto onError;
8559 if (!PyInt_Check(v)) {
8560 PyErr_SetString(PyExc_TypeError,
8561 "* wants int");
8562 goto onError;
8563 }
8564 width = PyInt_AsLong(v);
8565 if (width < 0) {
8566 flags |= F_LJUST;
8567 width = -width;
8568 }
8569 if (--fmtcnt >= 0)
8570 c = *fmt++;
8571 }
8572 else if (c >= '0' && c <= '9') {
8573 width = c - '0';
8574 while (--fmtcnt >= 0) {
8575 c = *fmt++;
8576 if (c < '0' || c > '9')
8577 break;
8578 if ((width*10) / 10 != width) {
8579 PyErr_SetString(PyExc_ValueError,
8580 "width too big");
8581 goto onError;
8582 }
8583 width = width*10 + (c - '0');
8584 }
8585 }
8586 if (c == '.') {
8587 prec = 0;
8588 if (--fmtcnt >= 0)
8589 c = *fmt++;
8590 if (c == '*') {
8591 v = getnextarg(args, arglen, &argidx);
8592 if (v == NULL)
8593 goto onError;
8594 if (!PyInt_Check(v)) {
8595 PyErr_SetString(PyExc_TypeError,
8596 "* wants int");
8597 goto onError;
8598 }
8599 prec = PyInt_AsLong(v);
8600 if (prec < 0)
8601 prec = 0;
8602 if (--fmtcnt >= 0)
8603 c = *fmt++;
8604 }
8605 else if (c >= '0' && c <= '9') {
8606 prec = c - '0';
8607 while (--fmtcnt >= 0) {
Stefan Krahae7dd8f2010-07-19 18:24:18 +00008608 c = *fmt++;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008609 if (c < '0' || c > '9')
8610 break;
8611 if ((prec*10) / 10 != prec) {
8612 PyErr_SetString(PyExc_ValueError,
8613 "prec too big");
8614 goto onError;
8615 }
8616 prec = prec*10 + (c - '0');
8617 }
8618 }
8619 } /* prec */
8620 if (fmtcnt >= 0) {
8621 if (c == 'h' || c == 'l' || c == 'L') {
8622 if (--fmtcnt >= 0)
8623 c = *fmt++;
8624 }
8625 }
8626 if (fmtcnt < 0) {
8627 PyErr_SetString(PyExc_ValueError,
8628 "incomplete format");
8629 goto onError;
8630 }
8631 if (c != '%') {
8632 v = getnextarg(args, arglen, &argidx);
8633 if (v == NULL)
8634 goto onError;
8635 }
8636 sign = 0;
8637 fill = ' ';
8638 switch (c) {
8639
8640 case '%':
8641 pbuf = formatbuf;
8642 /* presume that buffer length is at least 1 */
8643 pbuf[0] = '%';
8644 len = 1;
8645 break;
8646
8647 case 's':
8648 case 'r':
Victor Stinner4fd2ff92010-03-22 12:56:39 +00008649 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008650 temp = v;
8651 Py_INCREF(temp);
8652 }
8653 else {
8654 PyObject *unicode;
8655 if (c == 's')
8656 temp = PyObject_Unicode(v);
8657 else
8658 temp = PyObject_Repr(v);
8659 if (temp == NULL)
8660 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008661 if (PyUnicode_Check(temp))
8662 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008663 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008664 /* convert to string to Unicode */
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008665 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8666 PyString_GET_SIZE(temp),
8667 NULL,
8668 "strict");
8669 Py_DECREF(temp);
8670 temp = unicode;
8671 if (temp == NULL)
8672 goto onError;
8673 }
8674 else {
8675 Py_DECREF(temp);
8676 PyErr_SetString(PyExc_TypeError,
8677 "%s argument has non-string str()");
8678 goto onError;
8679 }
8680 }
8681 pbuf = PyUnicode_AS_UNICODE(temp);
8682 len = PyUnicode_GET_SIZE(temp);
8683 if (prec >= 0 && len > prec)
8684 len = prec;
8685 break;
8686
8687 case 'i':
8688 case 'd':
8689 case 'u':
8690 case 'o':
8691 case 'x':
8692 case 'X':
8693 if (c == 'i')
8694 c = 'd';
8695 isnumok = 0;
8696 if (PyNumber_Check(v)) {
8697 PyObject *iobj=NULL;
8698
8699 if (PyInt_Check(v) || (PyLong_Check(v))) {
8700 iobj = v;
8701 Py_INCREF(iobj);
8702 }
8703 else {
8704 iobj = PyNumber_Int(v);
8705 if (iobj==NULL) iobj = PyNumber_Long(v);
8706 }
8707 if (iobj!=NULL) {
8708 if (PyInt_Check(iobj)) {
8709 isnumok = 1;
8710 pbuf = formatbuf;
8711 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8712 flags, prec, c, iobj);
8713 Py_DECREF(iobj);
8714 if (len < 0)
8715 goto onError;
8716 sign = 1;
8717 }
8718 else if (PyLong_Check(iobj)) {
8719 isnumok = 1;
8720 temp = formatlong(iobj, flags, prec, c);
8721 Py_DECREF(iobj);
8722 if (!temp)
8723 goto onError;
8724 pbuf = PyUnicode_AS_UNICODE(temp);
8725 len = PyUnicode_GET_SIZE(temp);
8726 sign = 1;
8727 }
8728 else {
8729 Py_DECREF(iobj);
8730 }
8731 }
8732 }
8733 if (!isnumok) {
8734 PyErr_Format(PyExc_TypeError,
8735 "%%%c format: a number is required, "
8736 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8737 goto onError;
8738 }
8739 if (flags & F_ZERO)
8740 fill = '0';
8741 break;
8742
8743 case 'e':
8744 case 'E':
8745 case 'f':
8746 case 'F':
8747 case 'g':
8748 case 'G':
8749 if (c == 'F')
8750 c = 'f';
8751 pbuf = formatbuf;
8752 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8753 flags, prec, c, v);
8754 if (len < 0)
8755 goto onError;
8756 sign = 1;
8757 if (flags & F_ZERO)
8758 fill = '0';
8759 break;
8760
8761 case 'c':
8762 pbuf = formatbuf;
8763 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8764 if (len < 0)
8765 goto onError;
8766 break;
8767
8768 default:
8769 PyErr_Format(PyExc_ValueError,
8770 "unsupported format character '%c' (0x%x) "
8771 "at index %zd",
8772 (31<=c && c<=126) ? (char)c : '?',
8773 (int)c,
8774 (Py_ssize_t)(fmt - 1 -
8775 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008776 goto onError;
8777 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008778 if (sign) {
8779 if (*pbuf == '-' || *pbuf == '+') {
8780 sign = *pbuf++;
8781 len--;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008782 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008783 else if (flags & F_SIGN)
8784 sign = '+';
8785 else if (flags & F_BLANK)
8786 sign = ' ';
8787 else
8788 sign = 0;
8789 }
8790 if (width < len)
8791 width = len;
8792 if (rescnt - (sign != 0) < width) {
8793 reslen -= rescnt;
8794 rescnt = width + fmtcnt + 100;
8795 reslen += rescnt;
8796 if (reslen < 0) {
8797 Py_XDECREF(temp);
8798 PyErr_NoMemory();
8799 goto onError;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008800 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008801 if (_PyUnicode_Resize(&result, reslen) < 0) {
8802 Py_XDECREF(temp);
8803 goto onError;
8804 }
8805 res = PyUnicode_AS_UNICODE(result)
8806 + reslen - rescnt;
8807 }
8808 if (sign) {
8809 if (fill != ' ')
8810 *res++ = sign;
8811 rescnt--;
8812 if (width > len)
8813 width--;
8814 }
8815 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8816 assert(pbuf[0] == '0');
8817 assert(pbuf[1] == c);
8818 if (fill != ' ') {
8819 *res++ = *pbuf++;
8820 *res++ = *pbuf++;
8821 }
8822 rescnt -= 2;
8823 width -= 2;
8824 if (width < 0)
8825 width = 0;
8826 len -= 2;
8827 }
8828 if (width > len && !(flags & F_LJUST)) {
8829 do {
8830 --rescnt;
8831 *res++ = fill;
8832 } while (--width > len);
8833 }
8834 if (fill == ' ') {
8835 if (sign)
8836 *res++ = sign;
8837 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8838 assert(pbuf[0] == '0');
8839 assert(pbuf[1] == c);
8840 *res++ = *pbuf++;
8841 *res++ = *pbuf++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008842 }
8843 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008844 Py_UNICODE_COPY(res, pbuf, len);
8845 res += len;
8846 rescnt -= len;
8847 while (--width >= len) {
8848 --rescnt;
8849 *res++ = ' ';
8850 }
8851 if (dict && (argidx < arglen) && c != '%') {
8852 PyErr_SetString(PyExc_TypeError,
8853 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008854 Py_XDECREF(temp);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008855 goto onError;
8856 }
8857 Py_XDECREF(temp);
8858 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859 } /* until end */
8860 if (argidx < arglen && !dict) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008861 PyErr_SetString(PyExc_TypeError,
8862 "not all arguments converted during string formatting");
8863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864 }
8865
Thomas Woutersa96affe2006-03-12 00:29:36 +00008866 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008867 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 if (args_owned) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008869 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870 }
8871 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 return (PyObject *)result;
8873
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008874 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 Py_XDECREF(result);
8876 Py_DECREF(uformat);
8877 if (args_owned) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008878 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 }
8880 return NULL;
8881}
8882
8883static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008884 (readbufferproc) unicode_buffer_getreadbuf,
8885 (writebufferproc) unicode_buffer_getwritebuf,
8886 (segcountproc) unicode_buffer_getsegcount,
8887 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888};
8889
Jeremy Hylton938ace62002-07-17 16:30:39 +00008890static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008891unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8892
Tim Peters6d6c1a32001-08-02 04:15:00 +00008893static PyObject *
8894unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8895{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008896 PyObject *x = NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008897 static char *kwlist[] = {"string", "encoding", "errors", 0};
8898 char *encoding = NULL;
8899 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008900
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008901 if (type != &PyUnicode_Type)
8902 return unicode_subtype_new(type, args, kwds);
8903 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008904 kwlist, &x, &encoding, &errors))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008905 return NULL;
8906 if (x == NULL)
8907 return (PyObject *)_PyUnicode_New(0);
8908 if (encoding == NULL && errors == NULL)
8909 return PyObject_Unicode(x);
8910 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008911 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008912}
8913
Guido van Rossume023fe02001-08-30 03:12:59 +00008914static PyObject *
8915unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8916{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008917 PyUnicodeObject *tmp, *pnew;
8918 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008919
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008920 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8921 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8922 if (tmp == NULL)
8923 return NULL;
8924 assert(PyUnicode_Check(tmp));
8925 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8926 if (pnew == NULL) {
8927 Py_DECREF(tmp);
8928 return NULL;
8929 }
8930 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8931 if (pnew->str == NULL) {
8932 _Py_ForgetReference((PyObject *)pnew);
8933 PyObject_Del(pnew);
8934 Py_DECREF(tmp);
8935 return PyErr_NoMemory();
8936 }
8937 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8938 pnew->length = n;
8939 pnew->hash = tmp->hash;
8940 Py_DECREF(tmp);
8941 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008942}
8943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008944PyDoc_STRVAR(unicode_doc,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008945 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008946\n\
8947Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008948encoding defaults to the current default string encoding.\n\
8949errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008950
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008952 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008953 "unicode", /* tp_name */
8954 sizeof(PyUnicodeObject), /* tp_size */
8955 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956 /* Slots */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008957 (destructor)unicode_dealloc, /* tp_dealloc */
8958 0, /* tp_print */
8959 0, /* tp_getattr */
8960 0, /* tp_setattr */
8961 0, /* tp_compare */
8962 unicode_repr, /* tp_repr */
8963 &unicode_as_number, /* tp_as_number */
8964 &unicode_as_sequence, /* tp_as_sequence */
8965 &unicode_as_mapping, /* tp_as_mapping */
8966 (hashfunc) unicode_hash, /* tp_hash*/
8967 0, /* tp_call*/
8968 (reprfunc) unicode_str, /* tp_str */
8969 PyObject_GenericGetAttr, /* tp_getattro */
8970 0, /* tp_setattro */
8971 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008972 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008973 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008974 unicode_doc, /* tp_doc */
8975 0, /* tp_traverse */
8976 0, /* tp_clear */
8977 PyUnicode_RichCompare, /* tp_richcompare */
8978 0, /* tp_weaklistoffset */
8979 0, /* tp_iter */
8980 0, /* tp_iternext */
8981 unicode_methods, /* tp_methods */
8982 0, /* tp_members */
8983 0, /* tp_getset */
8984 &PyBaseString_Type, /* tp_base */
8985 0, /* tp_dict */
8986 0, /* tp_descr_get */
8987 0, /* tp_descr_set */
8988 0, /* tp_dictoffset */
8989 0, /* tp_init */
8990 0, /* tp_alloc */
8991 unicode_new, /* tp_new */
8992 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993};
8994
8995/* Initialize the Unicode implementation */
8996
Thomas Wouters78890102000-07-22 19:25:51 +00008997void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008999 int i;
9000
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009001 /* XXX - move this array to unicodectype.c ? */
9002 Py_UNICODE linebreak[] = {
9003 0x000A, /* LINE FEED */
9004 0x000D, /* CARRIAGE RETURN */
9005 0x001C, /* FILE SEPARATOR */
9006 0x001D, /* GROUP SEPARATOR */
9007 0x001E, /* RECORD SEPARATOR */
9008 0x0085, /* NEXT LINE */
9009 0x2028, /* LINE SEPARATOR */
9010 0x2029, /* PARAGRAPH SEPARATOR */
9011 };
9012
Fred Drakee4315f52000-05-09 19:53:39 +00009013 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009014 free_list = NULL;
9015 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009017 if (!unicode_empty)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009018 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00009019
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009020 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009021 for (i = 0; i < 256; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009022 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009023 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009024 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009025
9026 /* initialize the linebreak bloom filter */
9027 bloom_linebreak = make_bloom_mask(
9028 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9029 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009030
9031 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032}
9033
9034/* Finalize the Unicode implementation */
9035
Christian Heimes3b718a72008-02-14 12:47:33 +00009036int
9037PyUnicode_ClearFreeList(void)
9038{
9039 int freelist_size = numfree;
9040 PyUnicodeObject *u;
9041
9042 for (u = free_list; u != NULL;) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009043 PyUnicodeObject *v = u;
9044 u = *(PyUnicodeObject **)u;
9045 if (v->str)
9046 PyObject_DEL(v->str);
9047 Py_XDECREF(v->defenc);
9048 PyObject_Del(v);
9049 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00009050 }
9051 free_list = NULL;
9052 assert(numfree == 0);
9053 return freelist_size;
9054}
9055
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056void
Thomas Wouters78890102000-07-22 19:25:51 +00009057_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009059 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009061 Py_XDECREF(unicode_empty);
9062 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009063
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009064 for (i = 0; i < 256; i++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009065 if (unicode_latin1[i]) {
9066 Py_DECREF(unicode_latin1[i]);
9067 unicode_latin1[i] = NULL;
9068 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009069 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009070 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009072
Anthony Baxterac6bd462006-04-13 02:06:09 +00009073#ifdef __cplusplus
9074}
9075#endif
9076
9077
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009078/*
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009079 Local variables:
9080 c-basic-offset: 4
9081 indent-tabs-mode: nil
9082 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009083*/