blob: 3e5f9bcaf964bdf8572d3e7b28d49259dbec21be [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson339f8c62009-01-31 22:25:08 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Christian Heimes4d4f2702008-01-30 11:32:37 +0000115/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* One entry per ASCII code (0x00..0x7F); 1 marks a whitespace
       character, 0 everything else. */

    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x09 HORIZONTAL TABULATION
     *   0x0A LINE FEED
     *   0x0B VERTICAL TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
146/* Same for linebreaks */
/* One entry per ASCII code (0x00..0x7F); 1 marks a line break character,
   0 everything else.  The table is only ever read, so it is declared
   const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0A LINE FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000185/* --- Bloom Filters ----------------------------------------------------- */
186
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
192
/* Integer type holding a bloom-style bitmask (one bit per value of the
   5 low bits of a character). */
#define BLOOM_MASK unsigned long

/* Mask consulted by BLOOM_LINEBREAK; it is not assigned in this block. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the 5 low bits of ch is set in mask.
   The shifted constant is unsigned long: shifting the int constant 1 by 31
   would move a bit into the sign bit, which is undefined behaviour.  The
   mask argument is parenthesized so compound expressions such as a | b
   are evaluated before the '&'. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Line break test: a table lookup for ch < 128, otherwise the bloom mask
   is used as a quick reject before calling Py_UNICODE_ISLINEBREAK. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* Membership test with a quick reject: unicode_member() is only called
   when the bloom mask says chr might be in set.  The whole expansion is
   parenthesized so the macro can be negated or combined with other
   operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000247 if (unicode == unicode_empty ||
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
290
291static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293{
294 register PyUnicodeObject *unicode;
295
Andrew Dalkee0df7622006-05-27 11:04:36 +0000296 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
300 }
301
Neal Norwitze7d8be82008-07-31 17:17:14 +0000302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
305 }
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000316 unicode_resize(unicode, length) < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
319 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000320 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000324 }
325 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000328 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 if (unicode == NULL)
331 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 }
335
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 if (!unicode->str) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000337 PyErr_NoMemory();
338 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000339 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
346 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000349 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000351 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000353
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000354 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000357 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000358 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360}
361
362static
Guido van Rossum9475a232001-10-05 20:51:39 +0000363void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000365 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000366 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 /* Keep-Alive optimization */
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
372 }
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
376 }
377 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 }
387}
388
Benjamin Peterson828a7062008-12-27 17:05:29 +0000389static
390int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391{
392 register PyUnicodeObject *v;
393
394 /* Argument checks */
395 if (unicode == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000396 PyErr_BadInternalCall();
397 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000398 }
Benjamin Peterson828a7062008-12-27 17:05:29 +0000399 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000401 PyErr_BadInternalCall();
402 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 }
404
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000408 if (v->length != length &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000418 }
419
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
423}
424
Benjamin Peterson828a7062008-12-27 17:05:29 +0000425int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
426{
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
428}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000431 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432{
433 PyUnicodeObject *unicode;
434
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
438
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000443 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000444
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
455 }
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
458 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000467 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000468
469 return (PyObject *)unicode;
470}
471
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000472PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
473{
474 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000475
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000478 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000479 return NULL;
480 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000481
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
487
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000492 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000493
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
504 }
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
507 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000508
509 return PyUnicode_DecodeUTF8(u, size, NULL);
510 }
511
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
515
516 return (PyObject *)unicode;
517}
518
519PyObject *PyUnicode_FromString(const char *u)
520{
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
525 }
526
527 return PyUnicode_FromStringAndSize(u, size);
528}
529
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530#ifdef HAVE_WCHAR_H
531
532PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000533 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534{
535 PyUnicodeObject *unicode;
536
537 if (w == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000538 PyErr_BadInternalCall();
539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000540 }
541
542 unicode = _PyUnicode_New(size);
543 if (!unicode)
544 return NULL;
545
546 /* Copy the wchar_t data into the new object */
547#ifdef HAVE_USABLE_WCHAR_T
548 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000549#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550 {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000551 register Py_UNICODE *u;
552 register Py_ssize_t i;
553 u = PyUnicode_AS_UNICODE(unicode);
554 for (i = size; i > 0; i--)
555 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000556 }
557#endif
558
559 return (PyObject *)unicode;
560}
561
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000562static void
563makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
564{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000565 *fmt++ = '%';
566 if (width) {
567 if (zeropad)
568 *fmt++ = '0';
569 fmt += sprintf(fmt, "%d", width);
570 }
571 if (precision)
572 fmt += sprintf(fmt, ".%d", precision);
573 if (longflag)
574 *fmt++ = 'l';
575 else if (size_tflag) {
576 char *f = PY_FORMAT_SIZE_T;
577 while (*f)
578 *fmt++ = *f++;
579 }
580 *fmt++ = c;
581 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000582}
583
/* Append the NUL-terminated char string `string` to the output position
   `s`, one element per char, advancing `s`.  Relies on the variables
   `copy` and `s` being in scope at the point of use. */
#define appendstring(string) {copy = (string); while (*copy) *s++ = *copy++;}
585
586PyObject *
587PyUnicode_FromFormatV(const char *format, va_list vargs)
588{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000589 va_list count;
590 Py_ssize_t callcount = 0;
591 PyObject **callresults = NULL;
592 PyObject **callresult = NULL;
593 Py_ssize_t n = 0;
594 int width = 0;
595 int precision = 0;
596 int zeropad;
597 const char* f;
598 Py_UNICODE *s;
599 PyObject *string;
600 /* used by sprintf */
601 char buffer[21];
602 /* use abuffer instead of buffer, if we need more space
603 * (which can happen if there's a format specifier with width). */
604 char *abuffer = NULL;
605 char *realbuffer;
606 Py_ssize_t abuffersize = 0;
607 char fmt[60]; /* should be enough for %0width.precisionld */
608 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000609
610#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000611 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000612#else
613#ifdef __va_copy
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000614 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000615#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000616 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000617#endif
618#endif
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000619 /* step 1: count the number of %S/%R/%s format specifications
620 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
621 * objects once during step 3 and put the result in an array) */
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000622 for (f = format; *f; f++) {
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000623 if (*f == '%') {
624 if (*(f+1)=='%')
625 continue;
Walter Dörwald67032252009-05-03 22:46:50 +0000626 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000627 ++callcount;
628 while (isdigit((unsigned)*f))
629 width = (width*10) + *f++ - '0';
630 while (*++f && *f != '%' && !isalpha((unsigned)*f))
631 ;
632 if (*f == 's')
633 ++callcount;
634 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000635 }
636 /* step 2: allocate memory for the results of
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000637 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000638 if (callcount) {
639 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
640 if (!callresults) {
641 PyErr_NoMemory();
642 return NULL;
643 }
644 callresult = callresults;
645 }
646 /* step 3: figure out how large a buffer we need */
647 for (f = format; *f; f++) {
648 if (*f == '%') {
649 const char* p = f;
650 width = 0;
651 while (isdigit((unsigned)*f))
652 width = (width*10) + *f++ - '0';
653 while (*++f && *f != '%' && !isalpha((unsigned)*f))
654 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000655
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000656 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
657 * they don't affect the amount of space we reserve.
658 */
659 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000660 (f[1] == 'd' || f[1] == 'u'))
661 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000662
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000663 switch (*f) {
664 case 'c':
665 (void)va_arg(count, int);
666 /* fall through... */
667 case '%':
668 n++;
669 break;
670 case 'd': case 'u': case 'i': case 'x':
671 (void) va_arg(count, int);
672 /* 20 bytes is enough to hold a 64-bit
673 integer. Decimal takes the most space.
674 This isn't enough for octal.
675 If a width is specified we need more
676 (which we allocate later). */
677 if (width < 20)
678 width = 20;
679 n += width;
680 if (abuffersize < width)
681 abuffersize = width;
682 break;
683 case 's':
684 {
685 /* UTF-8 */
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000686 unsigned char *s = va_arg(count, unsigned char*);
687 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
688 if (!str)
689 goto fail;
690 n += PyUnicode_GET_SIZE(str);
691 /* Remember the str and switch to the next slot */
692 *callresult++ = str;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000693 break;
694 }
695 case 'U':
696 {
697 PyObject *obj = va_arg(count, PyObject *);
698 assert(obj && PyUnicode_Check(obj));
699 n += PyUnicode_GET_SIZE(obj);
700 break;
701 }
702 case 'V':
703 {
704 PyObject *obj = va_arg(count, PyObject *);
705 const char *str = va_arg(count, const char *);
706 assert(obj || str);
707 assert(!obj || PyUnicode_Check(obj));
708 if (obj)
709 n += PyUnicode_GET_SIZE(obj);
710 else
711 n += strlen(str);
712 break;
713 }
714 case 'S':
715 {
716 PyObject *obj = va_arg(count, PyObject *);
717 PyObject *str;
718 assert(obj);
719 str = PyObject_Str(obj);
720 if (!str)
721 goto fail;
722 n += PyUnicode_GET_SIZE(str);
723 /* Remember the str and switch to the next slot */
724 *callresult++ = str;
725 break;
726 }
727 case 'R':
728 {
729 PyObject *obj = va_arg(count, PyObject *);
730 PyObject *repr;
731 assert(obj);
732 repr = PyObject_Repr(obj);
733 if (!repr)
734 goto fail;
735 n += PyUnicode_GET_SIZE(repr);
736 /* Remember the repr and switch to the next slot */
737 *callresult++ = repr;
738 break;
739 }
740 case 'p':
741 (void) va_arg(count, int);
742 /* maximum 64-bit pointer representation:
743 * 0xffffffffffffffff
744 * so 19 characters is enough.
745 * XXX I count 18 -- what's the extra for?
746 */
747 n += 19;
748 break;
749 default:
750 /* if we stumble upon an unknown
751 formatting code, copy the rest of
752 the format string to the output
753 string. (we cannot just skip the
754 code, since there's no way to know
755 what's in the argument list) */
756 n += strlen(p);
757 goto expand;
758 }
759 } else
760 n++;
761 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000762 expand:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000763 if (abuffersize > 20) {
764 abuffer = PyObject_Malloc(abuffersize);
765 if (!abuffer) {
766 PyErr_NoMemory();
767 goto fail;
768 }
769 realbuffer = abuffer;
770 }
771 else
772 realbuffer = buffer;
773 /* step 4: fill the buffer */
774 /* Since we've analyzed how much space we need for the worst case,
775 we don't have to resize the string.
776 There can be no errors beyond this point. */
777 string = PyUnicode_FromUnicode(NULL, n);
778 if (!string)
779 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000780
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000781 s = PyUnicode_AS_UNICODE(string);
782 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000783
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000784 for (f = format; *f; f++) {
785 if (*f == '%') {
786 const char* p = f++;
787 int longflag = 0;
788 int size_tflag = 0;
789 zeropad = (*f == '0');
790 /* parse the width.precision part */
791 width = 0;
792 while (isdigit((unsigned)*f))
793 width = (width*10) + *f++ - '0';
794 precision = 0;
795 if (*f == '.') {
796 f++;
797 while (isdigit((unsigned)*f))
798 precision = (precision*10) + *f++ - '0';
799 }
800 /* handle the long flag, but only for %ld and %lu.
801 others can be added when necessary. */
802 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
803 longflag = 1;
804 ++f;
805 }
806 /* handle the size_t flag. */
807 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
808 size_tflag = 1;
809 ++f;
810 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000811
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000812 switch (*f) {
813 case 'c':
814 *s++ = va_arg(vargs, int);
815 break;
816 case 'd':
817 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
818 if (longflag)
819 sprintf(realbuffer, fmt, va_arg(vargs, long));
820 else if (size_tflag)
821 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
822 else
823 sprintf(realbuffer, fmt, va_arg(vargs, int));
824 appendstring(realbuffer);
825 break;
826 case 'u':
827 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
828 if (longflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
830 else if (size_tflag)
831 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
832 else
833 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
834 appendstring(realbuffer);
835 break;
836 case 'i':
837 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
838 sprintf(realbuffer, fmt, va_arg(vargs, int));
839 appendstring(realbuffer);
840 break;
841 case 'x':
842 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
843 sprintf(realbuffer, fmt, va_arg(vargs, int));
844 appendstring(realbuffer);
845 break;
846 case 's':
847 {
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000848 /* unused, since we already have the result */
849 (void) va_arg(vargs, char *);
850 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
851 PyUnicode_GET_SIZE(*callresult));
852 s += PyUnicode_GET_SIZE(*callresult);
853 /* We're done with the unicode()/repr() => forget it */
854 Py_DECREF(*callresult);
855 /* switch to next unicode()/repr() result */
856 ++callresult;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000857 break;
858 }
859 case 'U':
860 {
861 PyObject *obj = va_arg(vargs, PyObject *);
862 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
863 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
864 s += size;
865 break;
866 }
867 case 'V':
868 {
869 PyObject *obj = va_arg(vargs, PyObject *);
870 const char *str = va_arg(vargs, const char *);
871 if (obj) {
872 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
873 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
874 s += size;
875 } else {
876 appendstring(str);
877 }
878 break;
879 }
880 case 'S':
881 case 'R':
882 {
883 Py_UNICODE *ucopy;
884 Py_ssize_t usize;
885 Py_ssize_t upos;
886 /* unused, since we already have the result */
887 (void) va_arg(vargs, PyObject *);
888 ucopy = PyUnicode_AS_UNICODE(*callresult);
889 usize = PyUnicode_GET_SIZE(*callresult);
890 for (upos = 0; upos<usize;)
891 *s++ = ucopy[upos++];
892 /* We're done with the unicode()/repr() => forget it */
893 Py_DECREF(*callresult);
894 /* switch to next unicode()/repr() result */
895 ++callresult;
896 break;
897 }
898 case 'p':
899 sprintf(buffer, "%p", va_arg(vargs, void*));
900 /* %p is ill-defined: ensure leading 0x. */
901 if (buffer[1] == 'X')
902 buffer[1] = 'x';
903 else if (buffer[1] != 'x') {
904 memmove(buffer+2, buffer, strlen(buffer)+1);
905 buffer[0] = '0';
906 buffer[1] = 'x';
907 }
908 appendstring(buffer);
909 break;
910 case '%':
911 *s++ = '%';
912 break;
913 default:
914 appendstring(p);
915 goto end;
916 }
917 } else
918 *s++ = *f;
919 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000920
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000921 end:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000922 if (callresults)
923 PyObject_Free(callresults);
924 if (abuffer)
925 PyObject_Free(abuffer);
926 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
927 return string;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000928 fail:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000929 if (callresults) {
930 PyObject **callresult2 = callresults;
931 while (callresult2 < callresult) {
932 Py_DECREF(*callresult2);
933 ++callresult2;
934 }
935 PyObject_Free(callresults);
936 }
937 if (abuffer)
938 PyObject_Free(abuffer);
939 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000940}
941
942#undef appendstring
943
944PyObject *
945PyUnicode_FromFormat(const char *format, ...)
946{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000947 PyObject* ret;
948 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000949
950#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000951 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000952#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000953 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000954#endif
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000955 ret = PyUnicode_FromFormatV(format, vargs);
956 va_end(vargs);
957 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000958}
959
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000961 wchar_t *w,
962 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963{
964 if (unicode == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000965 PyErr_BadInternalCall();
966 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000967 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000968
969 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000970 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000971 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000972
Guido van Rossumd57fd912000-03-10 22:53:23 +0000973#ifdef HAVE_USABLE_WCHAR_T
974 memcpy(w, unicode->str, size * sizeof(wchar_t));
975#else
976 {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000977 register Py_UNICODE *u;
978 register Py_ssize_t i;
979 u = PyUnicode_AS_UNICODE(unicode);
980 for (i = size; i > 0; i--)
981 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982 }
983#endif
984
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000985 if (size > PyUnicode_GET_SIZE(unicode))
986 return PyUnicode_GET_SIZE(unicode);
987 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000988 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989}
990
991#endif
992
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000993PyObject *PyUnicode_FromOrdinal(int ordinal)
994{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000995 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000996
997#ifdef Py_UNICODE_WIDE
998 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000999 PyErr_SetString(PyExc_ValueError,
1000 "unichr() arg not in range(0x110000) "
1001 "(wide Python build)");
1002 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001003 }
1004#else
1005 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001006 PyErr_SetString(PyExc_ValueError,
1007 "unichr() arg not in range(0x10000) "
1008 "(narrow Python build)");
1009 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001010 }
1011#endif
1012
Hye-Shik Chang40574832004-04-06 07:24:51 +00001013 s[0] = (Py_UNICODE)ordinal;
1014 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001015}
1016
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017PyObject *PyUnicode_FromObject(register PyObject *obj)
1018{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001019 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001020 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001021 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001022 Py_INCREF(obj);
1023 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001024 }
1025 if (PyUnicode_Check(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001026 /* For a Unicode subtype that's not a Unicode object,
1027 return a true Unicode object with the same data. */
1028 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1029 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001030 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001031 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1032}
1033
1034PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001035 const char *encoding,
1036 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001037{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001038 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001039 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001040 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001041
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 if (obj == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001043 PyErr_BadInternalCall();
1044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001046
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001047#if 0
1048 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001049 that no encodings is given and then redirect to
1050 PyObject_Unicode() which then applies the additional logic for
1051 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001052
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001053 NOTE: This API should really only be used for object which
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001054 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001055
1056 */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001057 if (PyUnicode_Check(obj)) {
1058 if (encoding) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001059 PyErr_SetString(PyExc_TypeError,
1060 "decoding Unicode is not supported");
1061 return NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001062 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001063 return PyObject_Unicode(obj);
1064 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065#else
1066 if (PyUnicode_Check(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001067 PyErr_SetString(PyExc_TypeError,
1068 "decoding Unicode is not supported");
1069 return NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001070 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001071#endif
1072
1073 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001074 if (PyString_Check(obj)) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001075 s = PyString_AS_STRING(obj);
1076 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001077 }
Christian Heimes3497f942008-05-26 12:29:14 +00001078 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001079 /* Python 2.x specific */
1080 PyErr_Format(PyExc_TypeError,
1081 "decoding bytearray is not supported");
1082 return NULL;
1083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001084 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001085 /* Overwrite the error message with something more useful in
1086 case of a TypeError. */
1087 if (PyErr_ExceptionMatches(PyExc_TypeError))
1088 PyErr_Format(PyExc_TypeError,
1089 "coercing to Unicode: need string or buffer, "
1090 "%.80s found",
1091 Py_TYPE(obj)->tp_name);
1092 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001093 }
Tim Petersced69f82003-09-16 20:30:58 +00001094
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if (len == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001097 Py_INCREF(unicode_empty);
1098 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099 }
Tim Petersced69f82003-09-16 20:30:58 +00001100 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001101 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001102
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001103 return v;
1104
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001105 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107}
1108
1109PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001110 Py_ssize_t size,
1111 const char *encoding,
1112 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113{
1114 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001115
1116 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001117 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001118
1119 /* Shortcuts for common default encodings */
1120 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001122 else if (strcmp(encoding, "latin-1") == 0)
1123 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001124#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1125 else if (strcmp(encoding, "mbcs") == 0)
1126 return PyUnicode_DecodeMBCS(s, size, errors);
1127#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001128 else if (strcmp(encoding, "ascii") == 0)
1129 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130
1131 /* Decode via the codec registry */
1132 buffer = PyBuffer_FromMemory((void *)s, size);
1133 if (buffer == NULL)
1134 goto onError;
1135 unicode = PyCodec_Decode(buffer, encoding, errors);
1136 if (unicode == NULL)
1137 goto onError;
1138 if (!PyUnicode_Check(unicode)) {
1139 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001140 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001141 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 Py_DECREF(unicode);
1143 goto onError;
1144 }
1145 Py_DECREF(buffer);
1146 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001147
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001148 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149 Py_XDECREF(buffer);
1150 return NULL;
1151}
1152
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001153PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1154 const char *encoding,
1155 const char *errors)
1156{
1157 PyObject *v;
1158
1159 if (!PyUnicode_Check(unicode)) {
1160 PyErr_BadArgument();
1161 goto onError;
1162 }
1163
1164 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001165 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001166
1167 /* Decode via the codec registry */
1168 v = PyCodec_Decode(unicode, encoding, errors);
1169 if (v == NULL)
1170 goto onError;
1171 return v;
1172
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001173 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001174 return NULL;
1175}
1176
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001178 Py_ssize_t size,
1179 const char *encoding,
1180 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181{
1182 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001183
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 unicode = PyUnicode_FromUnicode(s, size);
1185 if (unicode == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1188 Py_DECREF(unicode);
1189 return v;
1190}
1191
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001192PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1193 const char *encoding,
1194 const char *errors)
1195{
1196 PyObject *v;
1197
1198 if (!PyUnicode_Check(unicode)) {
1199 PyErr_BadArgument();
1200 goto onError;
1201 }
1202
1203 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001204 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001205
1206 /* Encode via the codec registry */
1207 v = PyCodec_Encode(unicode, encoding, errors);
1208 if (v == NULL)
1209 goto onError;
1210 return v;
1211
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001212 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001213 return NULL;
1214}
1215
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1217 const char *encoding,
1218 const char *errors)
1219{
1220 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001221
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222 if (!PyUnicode_Check(unicode)) {
1223 PyErr_BadArgument();
1224 goto onError;
1225 }
Fred Drakee4315f52000-05-09 19:53:39 +00001226
Tim Petersced69f82003-09-16 20:30:58 +00001227 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001228 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001229
1230 /* Shortcuts for common default encodings */
1231 if (errors == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001232 if (strcmp(encoding, "utf-8") == 0)
1233 return PyUnicode_AsUTF8String(unicode);
1234 else if (strcmp(encoding, "latin-1") == 0)
1235 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001236#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001237 else if (strcmp(encoding, "mbcs") == 0)
1238 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001239#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001240 else if (strcmp(encoding, "ascii") == 0)
1241 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 /* Encode via the codec registry */
1245 v = PyCodec_Encode(unicode, encoding, errors);
1246 if (v == NULL)
1247 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001248 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001250 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001251 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252 Py_DECREF(v);
1253 goto onError;
1254 }
1255 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001256
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001257 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 return NULL;
1259}
1260
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001261PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001262 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001263{
1264 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1265
1266 if (v)
1267 return v;
1268 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1269 if (v && errors == NULL)
1270 ((PyUnicodeObject *)unicode)->defenc = v;
1271 return v;
1272}
1273
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1275{
1276 if (!PyUnicode_Check(unicode)) {
1277 PyErr_BadArgument();
1278 goto onError;
1279 }
1280 return PyUnicode_AS_UNICODE(unicode);
1281
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001282 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 return NULL;
1284}
1285
Martin v. Löwis18e16552006-02-15 17:27:45 +00001286Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287{
1288 if (!PyUnicode_Check(unicode)) {
1289 PyErr_BadArgument();
1290 goto onError;
1291 }
1292 return PyUnicode_GET_SIZE(unicode);
1293
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001294 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 return -1;
1296}
1297
Thomas Wouters78890102000-07-22 19:25:51 +00001298const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001299{
1300 return unicode_default_encoding;
1301}
1302
1303int PyUnicode_SetDefaultEncoding(const char *encoding)
1304{
1305 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001306
Fred Drakee4315f52000-05-09 19:53:39 +00001307 /* Make sure the encoding is valid. As side effect, this also
1308 loads the encoding into the codec registry cache. */
1309 v = _PyCodec_Lookup(encoding);
1310 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001311 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001312 Py_DECREF(v);
1313 strncpy(unicode_default_encoding,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001314 encoding,
1315 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001316 return 0;
1317
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001318 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001319 return -1;
1320}
1321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001322/* error handling callback helper:
1323 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001324 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001325 and adjust various state variables.
1326 return 0 on success, -1 on error
1327*/
1328
1329static
1330int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001331 const char *encoding, const char *reason,
1332 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1333 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1334 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001336 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337
1338 PyObject *restuple = NULL;
1339 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001340 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1341 Py_ssize_t requiredsize;
1342 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001343 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001344 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001345 int res = -1;
1346
1347 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001348 *errorHandler = PyCodec_LookupError(errors);
1349 if (*errorHandler == NULL)
1350 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001351 }
1352
1353 if (*exceptionObject == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001354 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001355 encoding, input, insize, *startinpos, *endinpos, reason);
1356 if (*exceptionObject == NULL)
1357 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001358 }
1359 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001360 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1361 goto onError;
1362 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1363 goto onError;
1364 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1365 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001366 }
1367
1368 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1369 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001370 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001371 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00001372 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001373 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001374 }
1375 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001376 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001377 if (newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001378 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001379 if (newpos<0 || newpos>insize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001380 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1381 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383
1384 /* need more space? (at least enough for what we
1385 have+the replacement+the rest of the string (starting
1386 at the new input position), so we won't have to check space
1387 when there are no errors in the rest of the string) */
1388 repptr = PyUnicode_AS_UNICODE(repunicode);
1389 repsize = PyUnicode_GET_SIZE(repunicode);
1390 requiredsize = *outpos + repsize + insize-newpos;
1391 if (requiredsize > outsize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001392 if (requiredsize<2*outsize)
1393 requiredsize = 2*outsize;
1394 if (_PyUnicode_Resize(output, requiredsize) < 0)
1395 goto onError;
1396 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397 }
1398 *endinpos = newpos;
1399 *inptr = input + newpos;
1400 Py_UNICODE_COPY(*outptr, repptr, repsize);
1401 *outptr += repsize;
1402 *outpos += repsize;
1403 /* we made it! */
1404 res = 0;
1405
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001406 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001407 Py_XDECREF(restuple);
1408 return res;
1409}
1410
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001411/* --- UTF-7 Codec -------------------------------------------------------- */
1412
1413/* see RFC2152 for details */
1414
/* Per-character classification for the UTF-7 encoder, indexed by the
   7-bit character value.  It indicates whether a character is special,
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* SPECIAL(c, encodeO, encodeWS): true when character c must not be
   emitted directly.  That is the case for anything outside 1..127, for
   class 1 characters, for class 2 (whitespace) when encodeWS is true and
   for class 3 (Set O) when encodeO is true.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true.

   The flag arguments are parenthesized in the expansion so that compound
   expressions such as `a || b` keep their grouping. */

#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                \
     ((encodeO) && (utf7_special[(c)] == 3)))
1443
/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is a member of the base64 alphabet.
   Spelled out as explicit range checks instead of isalnum(): isalnum()
   depends on the current locale and its behaviour is undefined for
   arguments that are neither EOF nor representable as unsigned char,
   which wide Py_UNICODE values are not.  Like UB64() below, this assumes
   an ASCII-compatible execution character set. */
#define B64CHAR(c)                                                      \
    (((c) >= 'A' && (c) <= 'Z') ||                                      \
     ((c) >= 'a' && (c) <= 'z') ||                                      \
     ((c) >= '0' && (c) <= '9') ||                                      \
     (c) == '+' || (c) == '/')

/* UB64(c): inverse of B64() -- the 6-bit value of base64 character c.
   Only meaningful when B64CHAR(c) is true. */
#define UB64(c)                                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in the
   accumulator ch, write the top six pending bits to *out as a base64
   character and reduce the pending-bit count by six.  out and bits are
   modified in place; any remaining (< 6) bits stay in ch. */
#define ENCODE(out, ch, bits)                                           \
    while ((bits) >= 6) {                                               \
        *(out)++ = B64((ch) >> ((bits)-6));                             \
        (bits) -= 6;                                                    \
    }
1457
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   the accumulator ch, take the top 16 pending bits as one Py_UNICODE
   value and reduce the pending-bit count by 16.

   - If the surrogate flag is set, the value is dropped and the flag is
     cleared (an error was already reported for the preceding unit).
   - A value in 0xDC00..0xDFFF sets the flag, stores a message in the
     enclosing scope's `errmsg` and jumps to the enclosing `utf7Error`
     label.
   - Any other value is appended to *out.

   The expanding scope must therefore provide `errmsg` and `utf7Error`.
   NOTE(review): the range tested is the low-surrogate range although the
   comments speak of the high surrogate -- confirm this is intended. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high          \
               surrogate so let's not bother seeing if the low          \
               surrogate is correct or not */                           \
            surrogate = 0;                                              \
        }                                                               \
        else if (outCh < 0xDC00 || outCh > 0xDFFF) {                    \
            *out++ = outCh;                                             \
        }                                                               \
        else {                                                          \
            /* This is a surrogate pair. Unfortunately we can't         \
               represent it in a 16-bit character */                    \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001478 Py_ssize_t size,
1479 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001481 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1482}
1483
1484PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001485 Py_ssize_t size,
1486 const char *errors,
1487 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001488{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001489 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001490 Py_ssize_t startinpos;
1491 Py_ssize_t endinpos;
1492 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001493 const char *e;
1494 PyUnicodeObject *unicode;
1495 Py_UNICODE *p;
1496 const char *errmsg = "";
1497 int inShift = 0;
1498 unsigned int bitsleft = 0;
1499 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 int surrogate = 0;
1501 PyObject *errorHandler = NULL;
1502 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001503
1504 unicode = _PyUnicode_New(size);
1505 if (!unicode)
1506 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001507 if (size == 0) {
1508 if (consumed)
1509 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001510 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001511 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001512
1513 p = unicode->str;
1514 e = s + size;
1515
1516 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001517 Py_UNICODE ch;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001518 restart:
Antoine Pitrou4982d5d2008-07-25 17:45:59 +00001519 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001520
1521 if (inShift) {
1522 if ((ch == '-') || !B64CHAR(ch)) {
1523 inShift = 0;
1524 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001525
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1527 if (bitsleft >= 6) {
1528 /* The shift sequence has a partial character in it. If
1529 bitsleft < 6 then we could just classify it as padding
1530 but that is not the case here */
1531
1532 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001533 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534 }
1535 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001536 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001537 here so indicate the potential of a misencoded character. */
1538
1539 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1540 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1541 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001542 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001543 }
1544
1545 if (ch == '-') {
1546 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001547 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548 inShift = 1;
1549 }
1550 } else if (SPECIAL(ch,0,0)) {
1551 errmsg = "unexpected special character";
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001552 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 } else {
1554 *p++ = ch;
1555 }
1556 } else {
1557 charsleft = (charsleft << 6) | UB64(ch);
1558 bitsleft += 6;
1559 s++;
1560 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1561 }
1562 }
1563 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001564 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001565 s++;
1566 if (s < e && *s == '-') {
1567 s++;
1568 *p++ = '+';
1569 } else
1570 {
1571 inShift = 1;
1572 bitsleft = 0;
1573 }
1574 }
1575 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001576 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577 errmsg = "unexpected special character";
1578 s++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001579 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001580 }
1581 else {
1582 *p++ = ch;
1583 s++;
1584 }
1585 continue;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001586 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001587 outpos = p-PyUnicode_AS_UNICODE(unicode);
1588 endinpos = s-starts;
1589 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001590 errors, &errorHandler,
1591 "utf7", errmsg,
1592 starts, size, &startinpos, &endinpos, &exc, &s,
1593 &unicode, &outpos, &p))
1594 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 }
1596
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001597 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001598 outpos = p-PyUnicode_AS_UNICODE(unicode);
1599 endinpos = size;
1600 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001601 errors, &errorHandler,
1602 "utf7", "unterminated shift sequence",
1603 starts, size, &startinpos, &endinpos, &exc, &s,
1604 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001605 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606 if (s < e)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001607 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001609 if (consumed) {
1610 if(inShift)
1611 *consumed = startinpos;
1612 else
1613 *consumed = s-starts;
1614 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001615
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001616 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001617 goto onError;
1618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001619 Py_XDECREF(errorHandler);
1620 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001621 return (PyObject *)unicode;
1622
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001623 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 Py_XDECREF(errorHandler);
1625 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626 Py_DECREF(unicode);
1627 return NULL;
1628}
1629
1630
1631PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001632 Py_ssize_t size,
1633 int encodeSetO,
1634 int encodeWhiteSpace,
1635 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636{
1637 PyObject *v;
1638 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001639 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001640 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001641 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 unsigned int bitsleft = 0;
1643 unsigned long charsleft = 0;
1644 char * out;
1645 char * start;
1646
Neal Norwitze7d8be82008-07-31 17:17:14 +00001647 if (cbAllocated / 5 != size)
1648 return PyErr_NoMemory();
1649
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 if (size == 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001651 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001653 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 if (v == NULL)
1655 return NULL;
1656
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001657 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 for (;i < size; ++i) {
1659 Py_UNICODE ch = s[i];
1660
1661 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001662 if (ch == '+') {
1663 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 *out++ = '-';
1665 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1666 charsleft = ch;
1667 bitsleft = 16;
1668 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001669 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001671 } else {
1672 *out++ = (char) ch;
1673 }
1674 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1676 *out++ = B64(charsleft << (6-bitsleft));
1677 charsleft = 0;
1678 bitsleft = 0;
1679 /* Characters not in the BASE64 set implicitly unshift the sequence
1680 so no '-' is required, except if the character is itself a '-' */
1681 if (B64CHAR(ch) || ch == '-') {
1682 *out++ = '-';
1683 }
1684 inShift = 0;
1685 *out++ = (char) ch;
1686 } else {
1687 bitsleft += 16;
1688 charsleft = (charsleft << 16) | ch;
1689 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1690
Jesus Cea585ad8a2009-07-02 15:37:21 +00001691 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001692 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001693 or '-' then the shift sequence will be terminated implicitly and we
1694 don't have to insert a '-'. */
1695
1696 if (bitsleft == 0) {
1697 if (i + 1 < size) {
1698 Py_UNICODE ch2 = s[i+1];
1699
1700 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001701
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001702 } else if (B64CHAR(ch2) || ch2 == '-') {
1703 *out++ = '-';
1704 inShift = 0;
1705 } else {
1706 inShift = 0;
1707 }
1708
1709 }
1710 else {
1711 *out++ = '-';
1712 inShift = 0;
1713 }
1714 }
Tim Petersced69f82003-09-16 20:30:58 +00001715 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001717 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 if (bitsleft) {
1719 *out++= B64(charsleft << (6-bitsleft) );
1720 *out++ = '-';
1721 }
1722
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001723 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001724 return v;
1725}
1726
1727#undef SPECIAL
1728#undef B64
1729#undef B64CHAR
1730#undef UB64
1731#undef ENCODE
1732#undef DECODE
1733
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734/* --- UTF-8 Codec -------------------------------------------------------- */
1735
/* Sequence length implied by each possible first byte of a UTF-8
   sequence.  Zero marks a byte that is not a legal prefix.  See RFC 2279
   for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10-0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40-0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50-0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60-0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1757
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001759 Py_ssize_t size,
1760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761{
Walter Dörwald69652032004-09-07 20:24:22 +00001762 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1763}
1764
1765PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001766 Py_ssize_t size,
1767 const char *errors,
1768 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001770 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t startinpos;
1773 Py_ssize_t endinpos;
1774 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 const char *e;
1776 PyUnicodeObject *unicode;
1777 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001778 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 PyObject *errorHandler = NULL;
1780 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781
1782 /* Note: size will always be longer than the resulting Unicode
1783 character count */
1784 unicode = _PyUnicode_New(size);
1785 if (!unicode)
1786 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001787 if (size == 0) {
1788 if (consumed)
1789 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792
1793 /* Unpack UTF-8 encoded data */
1794 p = unicode->str;
1795 e = s + size;
1796
1797 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001798 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799
1800 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001801 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 s++;
1803 continue;
1804 }
1805
1806 n = utf8_code_length[ch];
1807
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001808 if (s + n > e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001809 if (consumed)
1810 break;
1811 else {
1812 errmsg = "unexpected end of data";
1813 startinpos = s-starts;
1814 endinpos = size;
1815 goto utf8Error;
1816 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001817 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818
1819 switch (n) {
1820
1821 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001822 errmsg = "unexpected code byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001823 startinpos = s-starts;
1824 endinpos = startinpos+1;
1825 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826
1827 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001828 errmsg = "internal error";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001829 startinpos = s-starts;
1830 endinpos = startinpos+1;
1831 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001832
1833 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 if ((s[1] & 0xc0) != 0x80) {
1835 errmsg = "invalid data";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001836 startinpos = s-starts;
1837 endinpos = startinpos+2;
1838 goto utf8Error;
1839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001841 if (ch < 0x80) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001842 startinpos = s-starts;
1843 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 errmsg = "illegal encoding";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001845 goto utf8Error;
1846 }
1847 else
1848 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 break;
1850
1851 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001852 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 (s[2] & 0xc0) != 0x80) {
1854 errmsg = "invalid data";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001855 startinpos = s-starts;
1856 endinpos = startinpos+3;
1857 goto utf8Error;
1858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001860 if (ch < 0x0800) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001861 /* Note: UTF-8 encodings of surrogates are considered
1862 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001863
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001864 XXX For wide builds (UCS-4) we should probably try
1865 to recombine the surrogates into a single code
1866 unit.
1867 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001868 errmsg = "illegal encoding";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001869 startinpos = s-starts;
1870 endinpos = startinpos+3;
1871 goto utf8Error;
1872 }
1873 else
1874 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001875 break;
1876
1877 case 4:
1878 if ((s[1] & 0xc0) != 0x80 ||
1879 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 (s[3] & 0xc0) != 0x80) {
1881 errmsg = "invalid data";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001882 startinpos = s-starts;
1883 endinpos = startinpos+4;
1884 goto utf8Error;
1885 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001886 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001887 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001888 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001889 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001890 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001891 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001892 UTF-16 */
1893 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001894 errmsg = "illegal encoding";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001895 startinpos = s-starts;
1896 endinpos = startinpos+4;
1897 goto utf8Error;
1898 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001899#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001900 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001901#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001902 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001903
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001904 /* translate from 10000..10FFFF to 0..FFFF */
1905 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001906
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001907 /* high surrogate = top 10 bits added to D800 */
1908 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001909
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001910 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913 break;
1914
1915 default:
1916 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001917 errmsg = "unsupported Unicode code range";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001918 startinpos = s-starts;
1919 endinpos = startinpos+n;
1920 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921 }
1922 s += n;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001923 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001924
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001925 utf8Error:
1926 outpos = p-PyUnicode_AS_UNICODE(unicode);
1927 if (unicode_decode_call_errorhandler(
1928 errors, &errorHandler,
1929 "utf8", errmsg,
1930 starts, size, &startinpos, &endinpos, &exc, &s,
1931 &unicode, &outpos, &p))
1932 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 }
Walter Dörwald69652032004-09-07 20:24:22 +00001934 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001935 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936
1937 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001938 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 goto onError;
1940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001941 Py_XDECREF(errorHandler);
1942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943 return (PyObject *)unicode;
1944
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001945 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001946 Py_XDECREF(errorHandler);
1947 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948 Py_DECREF(unicode);
1949 return NULL;
1950}
1951
Tim Peters602f7402002-04-27 18:03:26 +00001952/* Allocation strategy: if the string is short, convert into a stack buffer
1953 and allocate exactly as much space needed at the end. Else allocate the
1954 maximum possible needed (4 result bytes per Unicode character), and return
1955 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001956*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001957PyObject *
1958PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001959 Py_ssize_t size,
1960 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961{
Tim Peters602f7402002-04-27 18:03:26 +00001962#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001963
Martin v. Löwis18e16552006-02-15 17:27:45 +00001964 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001965 PyObject *v; /* result string object */
1966 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001967 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001968 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001969 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001970
Tim Peters602f7402002-04-27 18:03:26 +00001971 assert(s != NULL);
1972 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973
Tim Peters602f7402002-04-27 18:03:26 +00001974 if (size <= MAX_SHORT_UNICHARS) {
1975 /* Write into the stack buffer; nallocated can't overflow.
1976 * At the end, we'll allocate exactly as much heap space as it
1977 * turns out we need.
1978 */
1979 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1980 v = NULL; /* will allocate after we're done */
1981 p = stackbuf;
1982 }
1983 else {
1984 /* Overallocate on the heap, and give the excess back at the end. */
1985 nallocated = size * 4;
1986 if (nallocated / 4 != size) /* overflow! */
1987 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001988 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001989 if (v == NULL)
1990 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001991 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001992 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001993
Tim Peters602f7402002-04-27 18:03:26 +00001994 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001995 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001996
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001997 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001998 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002000
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002002 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002003 *p++ = (char)(0xc0 | (ch >> 6));
2004 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002005 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002006 else {
Tim Peters602f7402002-04-27 18:03:26 +00002007 /* Encode UCS2 Unicode ordinals */
2008 if (ch < 0x10000) {
2009 /* Special case: check for high surrogate */
2010 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2011 Py_UCS4 ch2 = s[i];
2012 /* Check for low surrogate and combine the two to
2013 form a UCS4 value */
2014 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002015 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002016 i++;
2017 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002018 }
Tim Peters602f7402002-04-27 18:03:26 +00002019 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002020 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002021 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002022 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2023 *p++ = (char)(0x80 | (ch & 0x3f));
2024 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002025 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002026 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002027 /* Encode UCS4 Unicode ordinals */
2028 *p++ = (char)(0xf0 | (ch >> 18));
2029 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2030 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2031 *p++ = (char)(0x80 | (ch & 0x3f));
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002034
Tim Peters602f7402002-04-27 18:03:26 +00002035 if (v == NULL) {
2036 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002037 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002038 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002039 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002040 }
2041 else {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002042 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002043 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002044 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002045 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002048
Tim Peters602f7402002-04-27 18:03:26 +00002049#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050}
2051
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2053{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 if (!PyUnicode_Check(unicode)) {
2055 PyErr_BadArgument();
2056 return NULL;
2057 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002058 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002059 PyUnicode_GET_SIZE(unicode),
2060 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061}
2062
Walter Dörwald6e390802007-08-17 16:41:28 +00002063/* --- UTF-32 Codec ------------------------------------------------------- */
2064
2065PyObject *
2066PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002067 Py_ssize_t size,
2068 const char *errors,
2069 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002070{
2071 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2072}
2073
2074PyObject *
2075PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002076 Py_ssize_t size,
2077 const char *errors,
2078 int *byteorder,
2079 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002080{
2081 const char *starts = s;
2082 Py_ssize_t startinpos;
2083 Py_ssize_t endinpos;
2084 Py_ssize_t outpos;
2085 PyUnicodeObject *unicode;
2086 Py_UNICODE *p;
2087#ifndef Py_UNICODE_WIDE
2088 int i, pairs;
2089#else
2090 const int pairs = 0;
2091#endif
2092 const unsigned char *q, *e;
2093 int bo = 0; /* assume native ordering by default */
2094 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002095 /* Offsets from q for retrieving bytes in the right order. */
2096#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2097 int iorder[] = {0, 1, 2, 3};
2098#else
2099 int iorder[] = {3, 2, 1, 0};
2100#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002101 PyObject *errorHandler = NULL;
2102 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002103 /* On narrow builds we split characters outside the BMP into two
2104 codepoints => count how much extra space we need. */
2105#ifndef Py_UNICODE_WIDE
2106 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002107 if (((Py_UCS4 *)s)[i] >= 0x10000)
2108 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002109#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002110
2111 /* This might be one to much, because of a BOM */
2112 unicode = _PyUnicode_New((size+3)/4+pairs);
2113 if (!unicode)
2114 return NULL;
2115 if (size == 0)
2116 return (PyObject *)unicode;
2117
2118 /* Unpack UTF-32 encoded data */
2119 p = unicode->str;
2120 q = (unsigned char *)s;
2121 e = q + size;
2122
2123 if (byteorder)
2124 bo = *byteorder;
2125
2126 /* Check for BOM marks (U+FEFF) in the input and adjust current
2127 byte order setting accordingly. In native mode, the leading BOM
2128 mark is skipped, in all other modes, it is copied to the output
2129 stream as-is (giving a ZWNBSP character). */
2130 if (bo == 0) {
2131 if (size >= 4) {
2132 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002133 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002134#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002135 if (bom == 0x0000FEFF) {
2136 q += 4;
2137 bo = -1;
2138 }
2139 else if (bom == 0xFFFE0000) {
2140 q += 4;
2141 bo = 1;
2142 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002143#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002144 if (bom == 0x0000FEFF) {
2145 q += 4;
2146 bo = 1;
2147 }
2148 else if (bom == 0xFFFE0000) {
2149 q += 4;
2150 bo = -1;
2151 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002152#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002153 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002154 }
2155
2156 if (bo == -1) {
2157 /* force LE */
2158 iorder[0] = 0;
2159 iorder[1] = 1;
2160 iorder[2] = 2;
2161 iorder[3] = 3;
2162 }
2163 else if (bo == 1) {
2164 /* force BE */
2165 iorder[0] = 3;
2166 iorder[1] = 2;
2167 iorder[2] = 1;
2168 iorder[3] = 0;
2169 }
2170
2171 while (q < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002172 Py_UCS4 ch;
2173 /* remaining bytes at the end? (size should be divisible by 4) */
2174 if (e-q<4) {
2175 if (consumed)
2176 break;
2177 errmsg = "truncated data";
2178 startinpos = ((const char *)q)-starts;
2179 endinpos = ((const char *)e)-starts;
2180 goto utf32Error;
2181 /* The remaining input chars are ignored if the callback
2182 chooses to skip the input */
2183 }
2184 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2185 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002186
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002187 if (ch >= 0x110000)
2188 {
2189 errmsg = "codepoint not in range(0x110000)";
2190 startinpos = ((const char *)q)-starts;
2191 endinpos = startinpos+4;
2192 goto utf32Error;
2193 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002194#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002195 if (ch >= 0x10000)
2196 {
2197 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2198 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2199 }
2200 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002201#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002202 *p++ = ch;
2203 q += 4;
2204 continue;
2205 utf32Error:
2206 outpos = p-PyUnicode_AS_UNICODE(unicode);
2207 if (unicode_decode_call_errorhandler(
2208 errors, &errorHandler,
2209 "utf32", errmsg,
Georg Brandlf7a09be2009-09-17 11:33:31 +00002210 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002211 &unicode, &outpos, &p))
2212 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002213 }
2214
2215 if (byteorder)
2216 *byteorder = bo;
2217
2218 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002219 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002220
2221 /* Adjust length */
2222 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2223 goto onError;
2224
2225 Py_XDECREF(errorHandler);
2226 Py_XDECREF(exc);
2227 return (PyObject *)unicode;
2228
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002229 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002230 Py_DECREF(unicode);
2231 Py_XDECREF(errorHandler);
2232 Py_XDECREF(exc);
2233 return NULL;
2234}
2235
2236PyObject *
2237PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002238 Py_ssize_t size,
2239 const char *errors,
2240 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002241{
2242 PyObject *v;
2243 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002244 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002245#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002246 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002247#else
2248 const int pairs = 0;
2249#endif
2250 /* Offsets from p for storing byte pairs in the right order. */
2251#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2252 int iorder[] = {0, 1, 2, 3};
2253#else
2254 int iorder[] = {3, 2, 1, 0};
2255#endif
2256
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002257#define STORECHAR(CH) \
2258 do { \
2259 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2260 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2261 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2262 p[iorder[0]] = (CH) & 0xff; \
2263 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002264 } while(0)
2265
2266 /* In narrow builds we can output surrogate pairs as one codepoint,
2267 so we need less space. */
2268#ifndef Py_UNICODE_WIDE
2269 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002270 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2271 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2272 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002273#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002274 nsize = (size - pairs + (byteorder == 0));
2275 bytesize = nsize * 4;
2276 if (bytesize / 4 != nsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002277 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002278 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002279 if (v == NULL)
2280 return NULL;
2281
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002282 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002283 if (byteorder == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002284 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002285 if (size == 0)
2286 return v;
2287
2288 if (byteorder == -1) {
2289 /* force LE */
2290 iorder[0] = 0;
2291 iorder[1] = 1;
2292 iorder[2] = 2;
2293 iorder[3] = 3;
2294 }
2295 else if (byteorder == 1) {
2296 /* force BE */
2297 iorder[0] = 3;
2298 iorder[1] = 2;
2299 iorder[2] = 1;
2300 iorder[3] = 0;
2301 }
2302
2303 while (size-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002304 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002305#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002306 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2307 Py_UCS4 ch2 = *s;
2308 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2309 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2310 s++;
2311 size--;
2312 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002313 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002314#endif
2315 STORECHAR(ch);
2316 }
2317 return v;
2318#undef STORECHAR
2319}
2320
2321PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2322{
2323 if (!PyUnicode_Check(unicode)) {
2324 PyErr_BadArgument();
2325 return NULL;
2326 }
2327 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002328 PyUnicode_GET_SIZE(unicode),
2329 NULL,
2330 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002331}
2332
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333/* --- UTF-16 Codec ------------------------------------------------------- */
2334
Tim Peters772747b2001-08-09 22:21:55 +00002335PyObject *
2336PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002337 Py_ssize_t size,
2338 const char *errors,
2339 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340{
Walter Dörwald69652032004-09-07 20:24:22 +00002341 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2342}
2343
2344PyObject *
2345PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002346 Py_ssize_t size,
2347 const char *errors,
2348 int *byteorder,
2349 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002350{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002351 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002352 Py_ssize_t startinpos;
2353 Py_ssize_t endinpos;
2354 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355 PyUnicodeObject *unicode;
2356 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002357 const unsigned char *q, *e;
2358 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002359 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002360 /* Offsets from q for retrieving byte pairs in the right order. */
2361#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2362 int ihi = 1, ilo = 0;
2363#else
2364 int ihi = 0, ilo = 1;
2365#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002366 PyObject *errorHandler = NULL;
2367 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368
2369 /* Note: size will always be longer than the resulting Unicode
2370 character count */
2371 unicode = _PyUnicode_New(size);
2372 if (!unicode)
2373 return NULL;
2374 if (size == 0)
2375 return (PyObject *)unicode;
2376
2377 /* Unpack UTF-16 encoded data */
2378 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002379 q = (unsigned char *)s;
2380 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381
2382 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002383 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002385 /* Check for BOM marks (U+FEFF) in the input and adjust current
2386 byte order setting accordingly. In native mode, the leading BOM
2387 mark is skipped, in all other modes, it is copied to the output
2388 stream as-is (giving a ZWNBSP character). */
2389 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002390 if (size >= 2) {
2391 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002392#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002393 if (bom == 0xFEFF) {
2394 q += 2;
2395 bo = -1;
2396 }
2397 else if (bom == 0xFFFE) {
2398 q += 2;
2399 bo = 1;
2400 }
Tim Petersced69f82003-09-16 20:30:58 +00002401#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002402 if (bom == 0xFEFF) {
2403 q += 2;
2404 bo = 1;
2405 }
2406 else if (bom == 0xFFFE) {
2407 q += 2;
2408 bo = -1;
2409 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002410#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002411 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413
Tim Peters772747b2001-08-09 22:21:55 +00002414 if (bo == -1) {
2415 /* force LE */
2416 ihi = 1;
2417 ilo = 0;
2418 }
2419 else if (bo == 1) {
2420 /* force BE */
2421 ihi = 0;
2422 ilo = 1;
2423 }
2424
2425 while (q < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002426 Py_UNICODE ch;
2427 /* remaining bytes at the end? (size should be even) */
2428 if (e-q<2) {
2429 if (consumed)
2430 break;
2431 errmsg = "truncated data";
2432 startinpos = ((const char *)q)-starts;
2433 endinpos = ((const char *)e)-starts;
2434 goto utf16Error;
2435 /* The remaining input chars are ignored if the callback
2436 chooses to skip the input */
2437 }
2438 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002439
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002440 q += 2;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002441
2442 if (ch < 0xD800 || ch > 0xDFFF) {
2443 *p++ = ch;
2444 continue;
2445 }
2446
2447 /* UTF-16 code pair: */
2448 if (q >= e) {
2449 errmsg = "unexpected end of data";
2450 startinpos = (((const char *)q)-2)-starts;
2451 endinpos = ((const char *)e)-starts;
2452 goto utf16Error;
2453 }
2454 if (0xD800 <= ch && ch <= 0xDBFF) {
2455 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2456 q += 2;
2457 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002458#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002459 *p++ = ch;
2460 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002461#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002462 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002463#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002464 continue;
2465 }
2466 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002467 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002468 startinpos = (((const char *)q)-4)-starts;
2469 endinpos = startinpos+2;
2470 goto utf16Error;
2471 }
2472
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002473 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002474 errmsg = "illegal encoding";
2475 startinpos = (((const char *)q)-2)-starts;
2476 endinpos = startinpos+2;
2477 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002478
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002479 utf16Error:
2480 outpos = p-PyUnicode_AS_UNICODE(unicode);
2481 if (unicode_decode_call_errorhandler(
2482 errors, &errorHandler,
2483 "utf16", errmsg,
2484 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2485 &unicode, &outpos, &p))
2486 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 }
2488
2489 if (byteorder)
2490 *byteorder = bo;
2491
Walter Dörwald69652032004-09-07 20:24:22 +00002492 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002493 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002494
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002496 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 goto onError;
2498
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 Py_XDECREF(errorHandler);
2500 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 return (PyObject *)unicode;
2502
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002503 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002505 Py_XDECREF(errorHandler);
2506 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507 return NULL;
2508}
2509
Tim Peters772747b2001-08-09 22:21:55 +00002510PyObject *
2511PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002512 Py_ssize_t size,
2513 const char *errors,
2514 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515{
2516 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002517 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002518 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002519#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002520 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002521#else
2522 const int pairs = 0;
2523#endif
Tim Peters772747b2001-08-09 22:21:55 +00002524 /* Offsets from p for storing byte pairs in the right order. */
2525#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2526 int ihi = 1, ilo = 0;
2527#else
2528 int ihi = 0, ilo = 1;
2529#endif
2530
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002531#define STORECHAR(CH) \
2532 do { \
2533 p[ihi] = ((CH) >> 8) & 0xff; \
2534 p[ilo] = (CH) & 0xff; \
2535 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002536 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002538#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002539 for (i = pairs = 0; i < size; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002540 if (s[i] >= 0x10000)
2541 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002542#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002543 /* 2 * (size + pairs + (byteorder == 0)) */
2544 if (size > PY_SSIZE_T_MAX ||
2545 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002546 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002547 nsize = size + pairs + (byteorder == 0);
2548 bytesize = nsize * 2;
2549 if (bytesize / 2 != nsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002550 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002551 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 if (v == NULL)
2553 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002555 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 if (byteorder == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002557 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002558 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002559 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002560
2561 if (byteorder == -1) {
2562 /* force LE */
2563 ihi = 1;
2564 ilo = 0;
2565 }
2566 else if (byteorder == 1) {
2567 /* force BE */
2568 ihi = 0;
2569 ilo = 1;
2570 }
2571
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002572 while (size-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002573 Py_UNICODE ch = *s++;
2574 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002575#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002576 if (ch >= 0x10000) {
2577 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2578 ch = 0xD800 | ((ch-0x10000) >> 10);
2579 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002580#endif
Tim Peters772747b2001-08-09 22:21:55 +00002581 STORECHAR(ch);
2582 if (ch2)
2583 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002586#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587}
2588
2589PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2590{
2591 if (!PyUnicode_Check(unicode)) {
2592 PyErr_BadArgument();
2593 return NULL;
2594 }
2595 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002596 PyUnicode_GET_SIZE(unicode),
2597 NULL,
2598 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599}
2600
2601/* --- Unicode Escape Codec ----------------------------------------------- */
2602
Fredrik Lundh06d12682001-01-24 07:59:11 +00002603static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002604
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002606 Py_ssize_t size,
2607 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002609 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002610 Py_ssize_t startinpos;
2611 Py_ssize_t endinpos;
2612 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002613 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002617 char* message;
2618 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002619 PyObject *errorHandler = NULL;
2620 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002621
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 /* Escaped strings will always be longer than the resulting
2623 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 length after conversion to the true value.
2625 (but if the error callback returns a long replacement string
2626 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 v = _PyUnicode_New(size);
2628 if (v == NULL)
2629 goto onError;
2630 if (size == 0)
2631 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002635
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 while (s < end) {
2637 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002638 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002639 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640
2641 /* Non-escape characters are interpreted as Unicode ordinals */
2642 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002643 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 continue;
2645 }
2646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002647 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648 /* \ - Escapes */
2649 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002650 c = *s++;
2651 if (s > end)
2652 c = '\0'; /* Invalid after \ */
2653 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002655 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656 case '\n': break;
2657 case '\\': *p++ = '\\'; break;
2658 case '\'': *p++ = '\''; break;
2659 case '\"': *p++ = '\"'; break;
2660 case 'b': *p++ = '\b'; break;
2661 case 'f': *p++ = '\014'; break; /* FF */
2662 case 't': *p++ = '\t'; break;
2663 case 'n': *p++ = '\n'; break;
2664 case 'r': *p++ = '\r'; break;
2665 case 'v': *p++ = '\013'; break; /* VT */
2666 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2667
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002668 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 case '0': case '1': case '2': case '3':
2670 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002671 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002672 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002673 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002674 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002675 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002677 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 break;
2679
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002680 /* hex escapes */
2681 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002683 digits = 2;
2684 message = "truncated \\xXX escape";
2685 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002687 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002689 digits = 4;
2690 message = "truncated \\uXXXX escape";
2691 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002693 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002694 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002695 digits = 8;
2696 message = "truncated \\UXXXXXXXX escape";
2697 hexescape:
2698 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002699 outpos = p-PyUnicode_AS_UNICODE(v);
2700 if (s+digits>end) {
2701 endinpos = size;
2702 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002703 errors, &errorHandler,
2704 "unicodeescape", "end of string in escape sequence",
2705 starts, size, &startinpos, &endinpos, &exc, &s,
2706 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 goto onError;
2708 goto nextByte;
2709 }
2710 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002711 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002712 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 endinpos = (s+i+1)-starts;
2714 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002715 errors, &errorHandler,
2716 "unicodeescape", message,
2717 starts, size, &startinpos, &endinpos, &exc, &s,
2718 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002719 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002721 }
2722 chr = (chr<<4) & ~0xF;
2723 if (c >= '0' && c <= '9')
2724 chr += c - '0';
2725 else if (c >= 'a' && c <= 'f')
2726 chr += 10 + c - 'a';
2727 else
2728 chr += 10 + c - 'A';
2729 }
2730 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002731 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 /* _decoding_error will have already written into the
2733 target buffer. */
2734 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002735 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002736 /* when we get here, chr is a 32-bit unicode character */
2737 if (chr <= 0xffff)
2738 /* UCS-2 character */
2739 *p++ = (Py_UNICODE) chr;
2740 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002741 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002742 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002743#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002744 *p++ = chr;
2745#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002746 chr -= 0x10000L;
2747 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002748 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002749#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002750 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 endinpos = s-starts;
2752 outpos = p-PyUnicode_AS_UNICODE(v);
2753 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002754 errors, &errorHandler,
2755 "unicodeescape", "illegal Unicode character",
2756 starts, size, &startinpos, &endinpos, &exc, &s,
2757 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002758 goto onError;
2759 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002760 break;
2761
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002762 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002763 case 'N':
2764 message = "malformed \\N character escape";
2765 if (ucnhash_CAPI == NULL) {
2766 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002767 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002768 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002769 if (m == NULL)
2770 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002771 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002772 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002773 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002774 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002775 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002776 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002777 if (ucnhash_CAPI == NULL)
2778 goto ucnhashError;
2779 }
2780 if (*s == '{') {
2781 const char *start = s+1;
2782 /* look for the closing brace */
2783 while (*s != '}' && s < end)
2784 s++;
2785 if (s > start && s < end && *s == '}') {
2786 /* found a name. look it up in the unicode database */
2787 message = "unknown Unicode character name";
2788 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002789 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002790 goto store;
2791 }
2792 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 endinpos = s-starts;
2794 outpos = p-PyUnicode_AS_UNICODE(v);
2795 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002796 errors, &errorHandler,
2797 "unicodeescape", message,
2798 starts, size, &startinpos, &endinpos, &exc, &s,
2799 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002800 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002801 break;
2802
2803 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002804 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002805 message = "\\ at end of string";
2806 s--;
2807 endinpos = s-starts;
2808 outpos = p-PyUnicode_AS_UNICODE(v);
2809 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002810 errors, &errorHandler,
2811 "unicodeescape", message,
2812 starts, size, &startinpos, &endinpos, &exc, &s,
2813 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002814 goto onError;
2815 }
2816 else {
2817 *p++ = '\\';
2818 *p++ = (unsigned char)s[-1];
2819 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002820 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002822 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002825 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002827 Py_XDECREF(errorHandler);
2828 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002830
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002831 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002832 PyErr_SetString(
2833 PyExc_UnicodeError,
2834 "\\N escapes not supported (can't load unicodedata module)"
2835 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002836 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837 Py_XDECREF(errorHandler);
2838 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002839 return NULL;
2840
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002841 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002843 Py_XDECREF(errorHandler);
2844 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 return NULL;
2846}
2847
2848/* Return a Unicode-Escape string version of the Unicode object.
2849
2850 If quotes is true, the string is enclosed in u"" or u'' quotes as
2851 appropriate.
2852
2853*/
2854
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002855Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002856 Py_ssize_t size,
2857 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002858{
2859 /* like wcschr, but doesn't stop at NULL characters */
2860
2861 while (size-- > 0) {
2862 if (*s == ch)
2863 return s;
2864 s++;
2865 }
2866
2867 return NULL;
2868}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002869
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870static
2871PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002872 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 int quotes)
2874{
2875 PyObject *repr;
2876 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002878 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002879#ifdef Py_UNICODE_WIDE
2880 const Py_ssize_t expandsize = 10;
2881#else
2882 const Py_ssize_t expandsize = 6;
2883#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884
Neal Norwitz17753ec2006-08-21 22:21:19 +00002885 /* XXX(nnorwitz): rather than over-allocating, it would be
2886 better to choose a different scheme. Perhaps scan the
2887 first N-chars of the string and allocate based on that size.
2888 */
2889 /* Initial allocation is based on the longest-possible unichr
2890 escape.
2891
2892 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2893 unichr, so in this case it's the longest unichr escape. In
2894 narrow (UTF-16) builds this is five chars per source unichr
2895 since there are two unichrs in the surrogate pair, so in narrow
2896 (UTF-16) builds it's not the longest unichr escape.
2897
2898 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2899 so in the narrow (UTF-16) build case it's the longest unichr
2900 escape.
2901 */
2902
Neal Norwitze7d8be82008-07-31 17:17:14 +00002903 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002904 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002905
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002906 repr = PyString_FromStringAndSize(NULL,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002907 2
2908 + expandsize*size
2909 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 if (repr == NULL)
2911 return NULL;
2912
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002913 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914
2915 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002917 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 !findchar(s, size, '"')) ? '"' : '\'';
2919 }
2920 while (size-- > 0) {
2921 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002922
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002923 /* Escape quotes and backslashes */
2924 if ((quotes &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002925 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 *p++ = '\\';
2927 *p++ = (char) ch;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002928 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002929 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002930
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002931#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002932 /* Map 21-bit characters to '\U00xxxxxx' */
2933 else if (ch >= 0x10000) {
2934 *p++ = '\\';
2935 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002936 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2937 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2938 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2939 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2940 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2941 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2942 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002943 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002944 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002945 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002946#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002947 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2948 else if (ch >= 0xD800 && ch < 0xDC00) {
2949 Py_UNICODE ch2;
2950 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002951
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002952 ch2 = *s++;
2953 size--;
2954 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2955 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2956 *p++ = '\\';
2957 *p++ = 'U';
2958 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2959 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2960 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2961 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2962 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2963 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2964 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2965 *p++ = hexdigit[ucs & 0x0000000F];
2966 continue;
2967 }
2968 /* Fall through: isolated surrogates are copied as-is */
2969 s--;
2970 size++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002971 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002972#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002973
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002975 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 *p++ = '\\';
2977 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002978 *p++ = hexdigit[(ch >> 12) & 0x000F];
2979 *p++ = hexdigit[(ch >> 8) & 0x000F];
2980 *p++ = hexdigit[(ch >> 4) & 0x000F];
2981 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002983
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002984 /* Map special whitespace to '\t', \n', '\r' */
2985 else if (ch == '\t') {
2986 *p++ = '\\';
2987 *p++ = 't';
2988 }
2989 else if (ch == '\n') {
2990 *p++ = '\\';
2991 *p++ = 'n';
2992 }
2993 else if (ch == '\r') {
2994 *p++ = '\\';
2995 *p++ = 'r';
2996 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002997
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002998 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002999 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003001 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003002 *p++ = hexdigit[(ch >> 4) & 0x000F];
3003 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003004 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003005
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 /* Copy everything else as-is */
3007 else
3008 *p++ = (char) ch;
3009 }
3010 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003011 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012
3013 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003014 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 return repr;
3016}
3017
3018PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003019 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020{
3021 return unicodeescape_string(s, size, 0);
3022}
3023
3024PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3025{
3026 if (!PyUnicode_Check(unicode)) {
3027 PyErr_BadArgument();
3028 return NULL;
3029 }
3030 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003031 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032}
3033
3034/* --- Raw Unicode Escape Codec ------------------------------------------- */
3035
3036PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003037 Py_ssize_t size,
3038 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003041 Py_ssize_t startinpos;
3042 Py_ssize_t endinpos;
3043 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003045 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 const char *end;
3047 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 PyObject *errorHandler = NULL;
3049 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003050
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 /* Escaped strings will always be longer than the resulting
3052 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 length after conversion to the true value. (But decoding error
3054 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 v = _PyUnicode_New(size);
3056 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003057 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003059 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 end = s + size;
3062 while (s < end) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003063 unsigned char c;
3064 Py_UCS4 x;
3065 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003066 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003068 /* Non-escape characters are interpreted as Unicode ordinals */
3069 if (*s != '\\') {
3070 *p++ = (unsigned char)*s++;
3071 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003072 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003073 startinpos = s-starts;
3074
3075 /* \u-escapes are only interpreted iff the number of leading
3076 backslashes if odd */
3077 bs = s;
3078 for (;s < end;) {
3079 if (*s != '\\')
3080 break;
3081 *p++ = (unsigned char)*s++;
3082 }
3083 if (((s - bs) & 1) == 0 ||
3084 s >= end ||
3085 (*s != 'u' && *s != 'U')) {
3086 continue;
3087 }
3088 p--;
3089 count = *s=='u' ? 4 : 8;
3090 s++;
3091
3092 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3093 outpos = p-PyUnicode_AS_UNICODE(v);
3094 for (x = 0, i = 0; i < count; ++i, ++s) {
3095 c = (unsigned char)*s;
3096 if (!isxdigit(c)) {
3097 endinpos = s-starts;
3098 if (unicode_decode_call_errorhandler(
3099 errors, &errorHandler,
3100 "rawunicodeescape", "truncated \\uXXXX",
3101 starts, size, &startinpos, &endinpos, &exc, &s,
3102 &v, &outpos, &p))
3103 goto onError;
3104 goto nextByte;
3105 }
3106 x = (x<<4) & ~0xF;
3107 if (c >= '0' && c <= '9')
3108 x += c - '0';
3109 else if (c >= 'a' && c <= 'f')
3110 x += 10 + c - 'a';
3111 else
3112 x += 10 + c - 'A';
3113 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003114 if (x <= 0xffff)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003115 /* UCS-2 character */
3116 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003117 else if (x <= 0x10ffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003118 /* UCS-4 character. Either store directly, or as
3119 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003120#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003121 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003122#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003123 x -= 0x10000L;
3124 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3125 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003126#endif
3127 } else {
3128 endinpos = s-starts;
3129 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003130 if (unicode_decode_call_errorhandler(
3131 errors, &errorHandler,
3132 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003133 starts, size, &startinpos, &endinpos, &exc, &s,
3134 &v, &outpos, &p))
3135 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003136 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003137 nextByte:
3138 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003140 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003141 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 Py_XDECREF(errorHandler);
3143 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003145
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003146 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003148 Py_XDECREF(errorHandler);
3149 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 return NULL;
3151}
3152
3153PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003154 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155{
3156 PyObject *repr;
3157 char *p;
3158 char *q;
3159
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003160 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003161#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003162 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003163#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003164 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003165#endif
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003166
Neal Norwitze7d8be82008-07-31 17:17:14 +00003167 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003168 return PyErr_NoMemory();
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003169
Neal Norwitze7d8be82008-07-31 17:17:14 +00003170 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 if (repr == NULL)
3172 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003173 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003174 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003176 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177 while (size-- > 0) {
3178 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003179#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003180 /* Map 32-bit characters to '\Uxxxxxxxx' */
3181 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003182 *p++ = '\\';
3183 *p++ = 'U';
3184 *p++ = hexdigit[(ch >> 28) & 0xf];
3185 *p++ = hexdigit[(ch >> 24) & 0xf];
3186 *p++ = hexdigit[(ch >> 20) & 0xf];
3187 *p++ = hexdigit[(ch >> 16) & 0xf];
3188 *p++ = hexdigit[(ch >> 12) & 0xf];
3189 *p++ = hexdigit[(ch >> 8) & 0xf];
3190 *p++ = hexdigit[(ch >> 4) & 0xf];
3191 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003192 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003193 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003194#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003195 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3196 if (ch >= 0xD800 && ch < 0xDC00) {
3197 Py_UNICODE ch2;
3198 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003199
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003200 ch2 = *s++;
3201 size--;
3202 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3203 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3204 *p++ = '\\';
3205 *p++ = 'U';
3206 *p++ = hexdigit[(ucs >> 28) & 0xf];
3207 *p++ = hexdigit[(ucs >> 24) & 0xf];
3208 *p++ = hexdigit[(ucs >> 20) & 0xf];
3209 *p++ = hexdigit[(ucs >> 16) & 0xf];
3210 *p++ = hexdigit[(ucs >> 12) & 0xf];
3211 *p++ = hexdigit[(ucs >> 8) & 0xf];
3212 *p++ = hexdigit[(ucs >> 4) & 0xf];
3213 *p++ = hexdigit[ucs & 0xf];
3214 continue;
3215 }
3216 /* Fall through: isolated surrogates are copied as-is */
3217 s--;
3218 size++;
3219 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003220#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003221 /* Map 16-bit characters to '\uxxxx' */
3222 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 *p++ = '\\';
3224 *p++ = 'u';
3225 *p++ = hexdigit[(ch >> 12) & 0xf];
3226 *p++ = hexdigit[(ch >> 8) & 0xf];
3227 *p++ = hexdigit[(ch >> 4) & 0xf];
3228 *p++ = hexdigit[ch & 15];
3229 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003230 /* Copy everything else as-is */
3231 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 *p++ = (char) ch;
3233 }
3234 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003235 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 return repr;
3237}
3238
3239PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3240{
3241 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003242 PyErr_BadArgument();
3243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 }
3245 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003246 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247}
3248
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003249/* --- Unicode Internal Codec ------------------------------------------- */
3250
3251PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003252 Py_ssize_t size,
3253 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003254{
3255 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003256 Py_ssize_t startinpos;
3257 Py_ssize_t endinpos;
3258 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003259 PyUnicodeObject *v;
3260 Py_UNICODE *p;
3261 const char *end;
3262 const char *reason;
3263 PyObject *errorHandler = NULL;
3264 PyObject *exc = NULL;
3265
Neal Norwitzd43069c2006-01-08 01:12:10 +00003266#ifdef Py_UNICODE_WIDE
3267 Py_UNICODE unimax = PyUnicode_GetMax();
3268#endif
3269
Armin Rigo7ccbca92006-10-04 12:17:45 +00003270 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003271 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3272 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003273 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003274 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003275 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003276 p = PyUnicode_AS_UNICODE(v);
3277 end = s + size;
3278
3279 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003280 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003281 /* We have to sanity check the raw data, otherwise doom looms for
3282 some malformed UCS-4 data. */
3283 if (
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003284#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003285 *p > unimax || *p < 0 ||
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003286#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003287 end-s < Py_UNICODE_SIZE
3288 )
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003289 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003290 startinpos = s - starts;
3291 if (end-s < Py_UNICODE_SIZE) {
3292 endinpos = end-starts;
3293 reason = "truncated input";
3294 }
3295 else {
3296 endinpos = s - starts + Py_UNICODE_SIZE;
3297 reason = "illegal code point (> 0x10FFFF)";
3298 }
3299 outpos = p - PyUnicode_AS_UNICODE(v);
3300 if (unicode_decode_call_errorhandler(
3301 errors, &errorHandler,
3302 "unicode_internal", reason,
3303 starts, size, &startinpos, &endinpos, &exc, &s,
Benjamin Peterson828a7062008-12-27 17:05:29 +00003304 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003305 goto onError;
3306 }
3307 }
3308 else {
3309 p++;
3310 s += Py_UNICODE_SIZE;
3311 }
3312 }
3313
Martin v. Löwis412fb672006-04-13 06:34:32 +00003314 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003315 goto onError;
3316 Py_XDECREF(errorHandler);
3317 Py_XDECREF(exc);
3318 return (PyObject *)v;
3319
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003320 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003321 Py_XDECREF(v);
3322 Py_XDECREF(errorHandler);
3323 Py_XDECREF(exc);
3324 return NULL;
3325}
3326
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327/* --- Latin-1 Codec ------------------------------------------------------ */
3328
3329PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003330 Py_ssize_t size,
3331 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332{
3333 PyUnicodeObject *v;
3334 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003335
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003337 if (size == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003338 Py_UNICODE r = *(unsigned char*)s;
3339 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003340 }
3341
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342 v = _PyUnicode_New(size);
3343 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003344 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003346 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 p = PyUnicode_AS_UNICODE(v);
3348 while (size-- > 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003349 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003351
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003352 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 Py_XDECREF(v);
3354 return NULL;
3355}
3356
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003357/* create or adjust a UnicodeEncodeError */
3358static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003359 const char *encoding,
3360 const Py_UNICODE *unicode, Py_ssize_t size,
3361 Py_ssize_t startpos, Py_ssize_t endpos,
3362 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364 if (*exceptionObject == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003365 *exceptionObject = PyUnicodeEncodeError_Create(
3366 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367 }
3368 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003369 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3370 goto onError;
3371 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3372 goto onError;
3373 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3374 goto onError;
3375 return;
3376 onError:
3377 Py_DECREF(*exceptionObject);
3378 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379 }
3380}
3381
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382/* raises a UnicodeEncodeError */
3383static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003384 const char *encoding,
3385 const Py_UNICODE *unicode, Py_ssize_t size,
3386 Py_ssize_t startpos, Py_ssize_t endpos,
3387 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388{
3389 make_encode_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003390 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003391 if (*exceptionObject != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003392 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393}
3394
3395/* error handling callback helper:
3396 build arguments, call the callback and check the arguments,
3397 put the result into newpos and return the replacement string, which
3398 has to be freed by the caller */
3399static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003400 PyObject **errorHandler,
3401 const char *encoding, const char *reason,
3402 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3403 Py_ssize_t startpos, Py_ssize_t endpos,
3404 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003405{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003406 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407
3408 PyObject *restuple;
3409 PyObject *resunicode;
3410
3411 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003412 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 if (*errorHandler == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003414 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 }
3416
3417 make_encode_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003418 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 if (*exceptionObject == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421
3422 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003423 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003425 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00003427 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003428 Py_DECREF(restuple);
3429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 }
3431 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003432 &resunicode, newpos)) {
3433 Py_DECREF(restuple);
3434 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 }
3436 if (*newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003437 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003438 if (*newpos<0 || *newpos>size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3440 Py_DECREF(restuple);
3441 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003443 Py_INCREF(resunicode);
3444 Py_DECREF(restuple);
3445 return resunicode;
3446}
3447
3448static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003449 Py_ssize_t size,
3450 const char *errors,
3451 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452{
3453 /* output object */
3454 PyObject *res;
3455 /* pointers to the beginning and end+1 of input */
3456 const Py_UNICODE *startp = p;
3457 const Py_UNICODE *endp = p + size;
3458 /* pointer to the beginning of the unencodable characters */
3459 /* const Py_UNICODE *badp = NULL; */
3460 /* pointer into the output */
3461 char *str;
3462 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003463 Py_ssize_t respos = 0;
3464 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003465 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3466 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467 PyObject *errorHandler = NULL;
3468 PyObject *exc = NULL;
3469 /* the following variable is used for caching string comparisons
3470 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3471 int known_errorHandler = -1;
3472
3473 /* allocate enough for a simple encoding without
3474 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003475 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 if (res == NULL)
3477 goto onError;
3478 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003479 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003480 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481 ressize = size;
3482
3483 while (p<endp) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003484 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003486 /* can we encode this? */
3487 if (c<limit) {
3488 /* no overflow check, because we know that the space is enough */
3489 *str++ = (char)c;
3490 ++p;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003491 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003492 else {
3493 Py_ssize_t unicodepos = p-startp;
3494 Py_ssize_t requiredsize;
3495 PyObject *repunicode;
3496 Py_ssize_t repsize;
3497 Py_ssize_t newpos;
3498 Py_ssize_t respos;
3499 Py_UNICODE *uni2;
3500 /* startpos for collecting unencodable chars */
3501 const Py_UNICODE *collstart = p;
3502 const Py_UNICODE *collend = p;
3503 /* find all unecodable characters */
3504 while ((collend < endp) && ((*collend)>=limit))
3505 ++collend;
3506 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3507 if (known_errorHandler==-1) {
3508 if ((errors==NULL) || (!strcmp(errors, "strict")))
3509 known_errorHandler = 1;
3510 else if (!strcmp(errors, "replace"))
3511 known_errorHandler = 2;
3512 else if (!strcmp(errors, "ignore"))
3513 known_errorHandler = 3;
3514 else if (!strcmp(errors, "xmlcharrefreplace"))
3515 known_errorHandler = 4;
3516 else
3517 known_errorHandler = 0;
3518 }
3519 switch (known_errorHandler) {
3520 case 1: /* strict */
3521 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3522 goto onError;
3523 case 2: /* replace */
3524 while (collstart++<collend)
3525 *str++ = '?'; /* fall through */
3526 case 3: /* ignore */
3527 p = collend;
3528 break;
3529 case 4: /* xmlcharrefreplace */
3530 respos = str-PyString_AS_STRING(res);
3531 /* determine replacement size (temporarily (mis)uses p) */
3532 for (p = collstart, repsize = 0; p < collend; ++p) {
3533 if (*p<10)
3534 repsize += 2+1+1;
3535 else if (*p<100)
3536 repsize += 2+2+1;
3537 else if (*p<1000)
3538 repsize += 2+3+1;
3539 else if (*p<10000)
3540 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003541#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003542 else
3543 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003544#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003545 else if (*p<100000)
3546 repsize += 2+5+1;
3547 else if (*p<1000000)
3548 repsize += 2+6+1;
3549 else
3550 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003551#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003552 }
3553 requiredsize = respos+repsize+(endp-collend);
3554 if (requiredsize > ressize) {
3555 if (requiredsize<2*ressize)
3556 requiredsize = 2*ressize;
3557 if (_PyString_Resize(&res, requiredsize))
3558 goto onError;
3559 str = PyString_AS_STRING(res) + respos;
3560 ressize = requiredsize;
3561 }
3562 /* generate replacement (temporarily (mis)uses p) */
3563 for (p = collstart; p < collend; ++p) {
3564 str += sprintf(str, "&#%d;", (int)*p);
3565 }
3566 p = collend;
3567 break;
3568 default:
3569 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3570 encoding, reason, startp, size, &exc,
3571 collstart-startp, collend-startp, &newpos);
3572 if (repunicode == NULL)
3573 goto onError;
3574 /* need more space? (at least enough for what we
3575 have+the replacement+the rest of the string, so
3576 we won't have to check space for encodable characters) */
3577 respos = str-PyString_AS_STRING(res);
3578 repsize = PyUnicode_GET_SIZE(repunicode);
3579 requiredsize = respos+repsize+(endp-collend);
3580 if (requiredsize > ressize) {
3581 if (requiredsize<2*ressize)
3582 requiredsize = 2*ressize;
3583 if (_PyString_Resize(&res, requiredsize)) {
3584 Py_DECREF(repunicode);
3585 goto onError;
3586 }
3587 str = PyString_AS_STRING(res) + respos;
3588 ressize = requiredsize;
3589 }
3590 /* check if there is anything unencodable in the replacement
3591 and copy it to the output */
3592 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3593 c = *uni2;
3594 if (c >= limit) {
3595 raise_encode_exception(&exc, encoding, startp, size,
3596 unicodepos, unicodepos+1, reason);
3597 Py_DECREF(repunicode);
3598 goto onError;
3599 }
3600 *str = (char)c;
3601 }
3602 p = startp + newpos;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003603 Py_DECREF(repunicode);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003604 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003605 }
3606 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003608 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 if (respos<ressize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003610 /* If this falls res will be NULL */
3611 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 Py_XDECREF(errorHandler);
3613 Py_XDECREF(exc);
3614 return res;
3615
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003616 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 Py_XDECREF(res);
3618 Py_XDECREF(errorHandler);
3619 Py_XDECREF(exc);
3620 return NULL;
3621}
3622
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003624 Py_ssize_t size,
3625 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628}
3629
3630PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3631{
3632 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003633 PyErr_BadArgument();
3634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 }
3636 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003637 PyUnicode_GET_SIZE(unicode),
3638 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639}
3640
3641/* --- 7-bit ASCII Codec -------------------------------------------------- */
3642
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003644 Py_ssize_t size,
3645 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 PyUnicodeObject *v;
3649 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003650 Py_ssize_t startinpos;
3651 Py_ssize_t endinpos;
3652 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 const char *e;
3654 PyObject *errorHandler = NULL;
3655 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003656
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003658 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003659 Py_UNICODE r = *(unsigned char*)s;
3660 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003661 }
Tim Petersced69f82003-09-16 20:30:58 +00003662
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 v = _PyUnicode_New(size);
3664 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003667 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 e = s + size;
3670 while (s < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003671 register unsigned char c = (unsigned char)*s;
3672 if (c < 128) {
3673 *p++ = c;
3674 ++s;
3675 }
3676 else {
3677 startinpos = s-starts;
3678 endinpos = startinpos + 1;
3679 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3680 if (unicode_decode_call_errorhandler(
3681 errors, &errorHandler,
3682 "ascii", "ordinal not in range(128)",
3683 starts, size, &startinpos, &endinpos, &exc, &s,
3684 &v, &outpos, &p))
3685 goto onError;
3686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003688 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003689 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3690 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 Py_XDECREF(errorHandler);
3692 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003694
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003695 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003697 Py_XDECREF(errorHandler);
3698 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 return NULL;
3700}
3701
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003703 Py_ssize_t size,
3704 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707}
3708
3709PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3710{
3711 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003712 PyErr_BadArgument();
3713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 }
3715 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003716 PyUnicode_GET_SIZE(unicode),
3717 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718}
3719
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003720#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003721
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003722/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003723
Hirokazu Yamamoto68e075e2009-03-21 13:04:41 +00003724#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003725#define NEED_RETRY
3726#endif
3727
3728/* XXX This code is limited to "true" double-byte encodings, as
3729 a) it assumes an incomplete character consists of a single byte, and
3730 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003731 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003732
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.  Returns 1 if so, 0 otherwise.

   A byte that IsDBCSLeadByte() rejects is never a lead byte.  Otherwise the
   previous character is located with CharPrev() and the byte is accepted
   when any of the following holds (checked in this order):
     - CharPrev() did not move (no previous character),
     - the previous character does not begin with a lead byte,
     - the previous character is exactly two bytes long.
   NOTE(review): presumably these checks exclude a byte that is really the
   trail byte of the preceding double-byte character -- confirm. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
3743
3744/*
3745 * Decode MBCS string into unicode object. If 'final' is set, converts
3746 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3747 */
3748static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003749 const char *s, /* MBCS string */
3750 int size, /* sizeof MBCS string */
3751 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003752{
3753 Py_UNICODE *p;
3754 Py_ssize_t n = 0;
3755 int usize = 0;
3756
3757 assert(size >= 0);
3758
3759 /* Skip trailing lead-byte unless 'final' is set */
3760 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003761 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003762
3763 /* First get the size of the result */
3764 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003765 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3766 if (usize == 0) {
3767 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3768 return -1;
3769 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003770 }
3771
3772 if (*v == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003773 /* Create unicode object */
3774 *v = _PyUnicode_New(usize);
3775 if (*v == NULL)
3776 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003777 }
3778 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003779 /* Extend unicode object */
3780 n = PyUnicode_GET_SIZE(*v);
3781 if (_PyUnicode_Resize(v, n + usize) < 0)
3782 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003783 }
3784
3785 /* Do the conversion */
3786 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003787 p = PyUnicode_AS_UNICODE(*v) + n;
3788 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3789 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3790 return -1;
3791 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003792 }
3793
3794 return size;
3795}
3796
3797PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003798 Py_ssize_t size,
3799 const char *errors,
3800 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003801{
3802 PyUnicodeObject *v = NULL;
3803 int done;
3804
3805 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003806 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003807
3808#ifdef NEED_RETRY
3809 retry:
3810 if (size > INT_MAX)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003811 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003812 else
3813#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003814 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003815
3816 if (done < 0) {
3817 Py_XDECREF(v);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003818 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003819 }
3820
3821 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003822 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003823
3824#ifdef NEED_RETRY
3825 if (size > INT_MAX) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003826 s += done;
3827 size -= done;
3828 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003829 }
3830#endif
3831
3832 return (PyObject *)v;
3833}
3834
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003835PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003836 Py_ssize_t size,
3837 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003838{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003839 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3840}
3841
3842/*
3843 * Convert unicode into string object (MBCS).
3844 * Returns 0 if succeed, -1 otherwise.
3845 */
3846static int encode_mbcs(PyObject **repr,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003847 const Py_UNICODE *p, /* unicode */
3848 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003849{
3850 int mbcssize = 0;
3851 Py_ssize_t n = 0;
3852
3853 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003854
3855 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003856 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003857 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3858 if (mbcssize == 0) {
3859 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3860 return -1;
3861 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003862 }
3863
Martin v. Löwisd8251432006-06-14 05:21:04 +00003864 if (*repr == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003865 /* Create string object */
3866 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3867 if (*repr == NULL)
3868 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003869 }
3870 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003871 /* Extend string object */
3872 n = PyString_Size(*repr);
3873 if (_PyString_Resize(repr, n + mbcssize) < 0)
3874 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003875 }
3876
3877 /* Do the conversion */
3878 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003879 char *s = PyString_AS_STRING(*repr) + n;
3880 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3881 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3882 return -1;
3883 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003884 }
3885
3886 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003887}
3888
3889PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003890 Py_ssize_t size,
3891 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003892{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003893 PyObject *repr = NULL;
3894 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003895
Martin v. Löwisd8251432006-06-14 05:21:04 +00003896#ifdef NEED_RETRY
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003897 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00003898 if (size > INT_MAX)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003899 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003900 else
3901#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003902 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003903
Martin v. Löwisd8251432006-06-14 05:21:04 +00003904 if (ret < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003905 Py_XDECREF(repr);
3906 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003907 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003908
3909#ifdef NEED_RETRY
3910 if (size > INT_MAX) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003911 p += INT_MAX;
3912 size -= INT_MAX;
3913 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003914 }
3915#endif
3916
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003917 return repr;
3918}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003919
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003920PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3921{
3922 if (!PyUnicode_Check(unicode)) {
3923 PyErr_BadArgument();
3924 return NULL;
3925 }
3926 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003927 PyUnicode_GET_SIZE(unicode),
3928 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003929}
3930
Martin v. Löwisd8251432006-06-14 05:21:04 +00003931#undef NEED_RETRY
3932
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003933#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003934
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935/* --- Character Mapping Codec -------------------------------------------- */
3936
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003938 Py_ssize_t size,
3939 PyObject *mapping,
3940 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003942 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003943 Py_ssize_t startinpos;
3944 Py_ssize_t endinpos;
3945 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 PyUnicodeObject *v;
3948 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003949 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003950 PyObject *errorHandler = NULL;
3951 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003952 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003953 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003954
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 /* Default to Latin-1 */
3956 if (mapping == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003957 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958
3959 v = _PyUnicode_New(size);
3960 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003961 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003963 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003966 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003967 mapstring = PyUnicode_AS_UNICODE(mapping);
3968 maplen = PyUnicode_GET_SIZE(mapping);
3969 while (s < e) {
3970 unsigned char ch = *s;
3971 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003973 if (ch < maplen)
3974 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003976 if (x == 0xfffe) {
3977 /* undefined mapping */
3978 outpos = p-PyUnicode_AS_UNICODE(v);
3979 startinpos = s-starts;
3980 endinpos = startinpos+1;
3981 if (unicode_decode_call_errorhandler(
3982 errors, &errorHandler,
3983 "charmap", "character maps to <undefined>",
3984 starts, size, &startinpos, &endinpos, &exc, &s,
3985 &v, &outpos, &p)) {
3986 goto onError;
3987 }
3988 continue;
3989 }
3990 *p++ = x;
3991 ++s;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003992 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003993 }
3994 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003995 while (s < e) {
3996 unsigned char ch = *s;
3997 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003998
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003999 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4000 w = PyInt_FromLong((long)ch);
4001 if (w == NULL)
4002 goto onError;
4003 x = PyObject_GetItem(mapping, w);
4004 Py_DECREF(w);
4005 if (x == NULL) {
4006 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4007 /* No mapping found means: mapping is undefined. */
4008 PyErr_Clear();
4009 x = Py_None;
4010 Py_INCREF(x);
4011 } else
4012 goto onError;
4013 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004014
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004015 /* Apply mapping */
4016 if (PyInt_Check(x)) {
4017 long value = PyInt_AS_LONG(x);
4018 if (value < 0 || value > 65535) {
4019 PyErr_SetString(PyExc_TypeError,
4020 "character mapping must be in range(65536)");
4021 Py_DECREF(x);
4022 goto onError;
4023 }
4024 *p++ = (Py_UNICODE)value;
4025 }
4026 else if (x == Py_None) {
4027 /* undefined mapping */
4028 outpos = p-PyUnicode_AS_UNICODE(v);
4029 startinpos = s-starts;
4030 endinpos = startinpos+1;
4031 if (unicode_decode_call_errorhandler(
4032 errors, &errorHandler,
4033 "charmap", "character maps to <undefined>",
4034 starts, size, &startinpos, &endinpos, &exc, &s,
4035 &v, &outpos, &p)) {
4036 Py_DECREF(x);
4037 goto onError;
4038 }
4039 Py_DECREF(x);
4040 continue;
4041 }
4042 else if (PyUnicode_Check(x)) {
4043 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004044
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004045 if (targetsize == 1)
4046 /* 1-1 mapping */
4047 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004048
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004049 else if (targetsize > 1) {
4050 /* 1-n mapping */
4051 if (targetsize > extrachars) {
4052 /* resize first */
4053 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4054 Py_ssize_t needed = (targetsize - extrachars) + \
4055 (targetsize << 2);
4056 extrachars += needed;
4057 /* XXX overflow detection missing */
4058 if (_PyUnicode_Resize(&v,
4059 PyUnicode_GET_SIZE(v) + needed) < 0) {
4060 Py_DECREF(x);
4061 goto onError;
4062 }
4063 p = PyUnicode_AS_UNICODE(v) + oldpos;
4064 }
4065 Py_UNICODE_COPY(p,
4066 PyUnicode_AS_UNICODE(x),
4067 targetsize);
4068 p += targetsize;
4069 extrachars -= targetsize;
4070 }
4071 /* 1-0 mapping: skip the character */
4072 }
4073 else {
4074 /* wrong return value */
4075 PyErr_SetString(PyExc_TypeError,
4076 "character mapping must return integer, None or unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004077 Py_DECREF(x);
4078 goto onError;
4079 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004080 Py_DECREF(x);
4081 ++s;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 }
4084 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004085 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4086 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 Py_XDECREF(errorHandler);
4088 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004090
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004091 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 Py_XDECREF(errorHandler);
4093 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 Py_XDECREF(v);
4095 return NULL;
4096}
4097
Martin v. Löwis3f767792006-06-04 19:36:28 +00004098/* Charmap encoding: the lookup table */
4099
4100struct encoding_map{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004101 PyObject_HEAD
4102 unsigned char level1[32];
4103 int count2, count3;
4104 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004105};
4106
4107static PyObject*
4108encoding_map_size(PyObject *obj, PyObject* args)
4109{
4110 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004111 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004112 128*map->count3);
4113}
4114
4115static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004116 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004117 PyDoc_STR("Return the size (in bytes) of this object") },
4118 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004119};
4120
4121static void
4122encoding_map_dealloc(PyObject* o)
4123{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004124 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004125}
4126
4127static PyTypeObject EncodingMapType = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004128 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004129 "EncodingMap", /*tp_name*/
4130 sizeof(struct encoding_map), /*tp_basicsize*/
4131 0, /*tp_itemsize*/
4132 /* methods */
4133 encoding_map_dealloc, /*tp_dealloc*/
4134 0, /*tp_print*/
4135 0, /*tp_getattr*/
4136 0, /*tp_setattr*/
4137 0, /*tp_compare*/
4138 0, /*tp_repr*/
4139 0, /*tp_as_number*/
4140 0, /*tp_as_sequence*/
4141 0, /*tp_as_mapping*/
4142 0, /*tp_hash*/
4143 0, /*tp_call*/
4144 0, /*tp_str*/
4145 0, /*tp_getattro*/
4146 0, /*tp_setattro*/
4147 0, /*tp_as_buffer*/
4148 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4149 0, /*tp_doc*/
4150 0, /*tp_traverse*/
4151 0, /*tp_clear*/
4152 0, /*tp_richcompare*/
4153 0, /*tp_weaklistoffset*/
4154 0, /*tp_iter*/
4155 0, /*tp_iternext*/
4156 encoding_map_methods, /*tp_methods*/
4157 0, /*tp_members*/
4158 0, /*tp_getset*/
4159 0, /*tp_base*/
4160 0, /*tp_dict*/
4161 0, /*tp_descr_get*/
4162 0, /*tp_descr_set*/
4163 0, /*tp_dictoffset*/
4164 0, /*tp_init*/
4165 0, /*tp_alloc*/
4166 0, /*tp_new*/
4167 0, /*tp_free*/
4168 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004169};
4170
4171PyObject*
4172PyUnicode_BuildEncodingMap(PyObject* string)
4173{
4174 Py_UNICODE *decode;
4175 PyObject *result;
4176 struct encoding_map *mresult;
4177 int i;
4178 int need_dict = 0;
4179 unsigned char level1[32];
4180 unsigned char level2[512];
4181 unsigned char *mlevel1, *mlevel2, *mlevel3;
4182 int count2 = 0, count3 = 0;
4183
4184 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4185 PyErr_BadArgument();
4186 return NULL;
4187 }
4188 decode = PyUnicode_AS_UNICODE(string);
4189 memset(level1, 0xFF, sizeof level1);
4190 memset(level2, 0xFF, sizeof level2);
4191
4192 /* If there isn't a one-to-one mapping of NULL to \0,
4193 or if there are non-BMP characters, we need to use
4194 a mapping dictionary. */
4195 if (decode[0] != 0)
4196 need_dict = 1;
4197 for (i = 1; i < 256; i++) {
4198 int l1, l2;
4199 if (decode[i] == 0
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004200#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004201 || decode[i] > 0xFFFF
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004202#endif
4203 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004204 need_dict = 1;
4205 break;
4206 }
4207 if (decode[i] == 0xFFFE)
4208 /* unmapped character */
4209 continue;
4210 l1 = decode[i] >> 11;
4211 l2 = decode[i] >> 7;
4212 if (level1[l1] == 0xFF)
4213 level1[l1] = count2++;
4214 if (level2[l2] == 0xFF)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004215 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004216 }
4217
4218 if (count2 >= 0xFF || count3 >= 0xFF)
4219 need_dict = 1;
4220
4221 if (need_dict) {
4222 PyObject *result = PyDict_New();
4223 PyObject *key, *value;
4224 if (!result)
4225 return NULL;
4226 for (i = 0; i < 256; i++) {
4227 key = value = NULL;
4228 key = PyInt_FromLong(decode[i]);
4229 value = PyInt_FromLong(i);
4230 if (!key || !value)
4231 goto failed1;
4232 if (PyDict_SetItem(result, key, value) == -1)
4233 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004234 Py_DECREF(key);
4235 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004236 }
4237 return result;
4238 failed1:
4239 Py_XDECREF(key);
4240 Py_XDECREF(value);
4241 Py_DECREF(result);
4242 return NULL;
4243 }
4244
4245 /* Create a three-level trie */
4246 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4247 16*count2 + 128*count3 - 1);
4248 if (!result)
4249 return PyErr_NoMemory();
4250 PyObject_Init(result, &EncodingMapType);
4251 mresult = (struct encoding_map*)result;
4252 mresult->count2 = count2;
4253 mresult->count3 = count3;
4254 mlevel1 = mresult->level1;
4255 mlevel2 = mresult->level23;
4256 mlevel3 = mresult->level23 + 16*count2;
4257 memcpy(mlevel1, level1, 32);
4258 memset(mlevel2, 0xFF, 16*count2);
4259 memset(mlevel3, 0, 128*count3);
4260 count3 = 0;
4261 for (i = 1; i < 256; i++) {
4262 int o1, o2, o3, i2, i3;
4263 if (decode[i] == 0xFFFE)
4264 /* unmapped character */
4265 continue;
4266 o1 = decode[i]>>11;
4267 o2 = (decode[i]>>7) & 0xF;
4268 i2 = 16*mlevel1[o1] + o2;
4269 if (mlevel2[i2] == 0xFF)
4270 mlevel2[i2] = count3++;
4271 o3 = decode[i] & 0x7F;
4272 i3 = 128*mlevel2[i2] + o3;
4273 mlevel3[i3] = i;
4274 }
4275 return result;
4276}
4277
4278static int
4279encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4280{
4281 struct encoding_map *map = (struct encoding_map*)mapping;
4282 int l1 = c>>11;
4283 int l2 = (c>>7) & 0xF;
4284 int l3 = c & 0x7F;
4285 int i;
4286
4287#ifdef Py_UNICODE_WIDE
4288 if (c > 0xFFFF) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004289 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004290 }
4291#endif
4292 if (c == 0)
4293 return 0;
4294 /* level 1*/
4295 i = map->level1[l1];
4296 if (i == 0xFF) {
4297 return -1;
4298 }
4299 /* level 2*/
4300 i = map->level23[16*i+l2];
4301 if (i == 0xFF) {
4302 return -1;
4303 }
4304 /* level 3 */
4305 i = map->level23[16*map->count2 + 128*i + l3];
4306 if (i == 0) {
4307 return -1;
4308 }
4309 return i;
4310}
4311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312/* Lookup the character ch in the mapping. If the character
4313 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004314 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 PyObject *w = PyInt_FromLong((long)c);
4318 PyObject *x;
4319
4320 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004321 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322 x = PyObject_GetItem(mapping, w);
4323 Py_DECREF(w);
4324 if (x == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004325 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4326 /* No mapping found means: mapping is undefined. */
4327 PyErr_Clear();
4328 x = Py_None;
4329 Py_INCREF(x);
4330 return x;
4331 } else
4332 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004334 else if (x == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004335 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 else if (PyInt_Check(x)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004337 long value = PyInt_AS_LONG(x);
4338 if (value < 0 || value > 255) {
4339 PyErr_SetString(PyExc_TypeError,
4340 "character mapping must be in range(256)");
4341 Py_DECREF(x);
4342 return NULL;
4343 }
4344 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004346 else if (PyString_Check(x))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004347 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004349 /* wrong return value */
4350 PyErr_SetString(PyExc_TypeError,
4351 "character mapping must return integer, None or str");
4352 Py_DECREF(x);
4353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 }
4355}
4356
Martin v. Löwis3f767792006-06-04 19:36:28 +00004357static int
4358charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4359{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004360 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4361 /* exponentially overallocate to minimize reallocations */
4362 if (requiredsize < 2*outsize)
4363 requiredsize = 2*outsize;
4364 if (_PyString_Resize(outobj, requiredsize)) {
4365 return 0;
4366 }
4367 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004368}
4369
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373/* lookup the character, put the result in the output string and adjust
4374 various state variables. Reallocate the output string if not enough
4375 space is available. Return a new reference to the object that
4376 was put in the output buffer, or Py_None, if the mapping was undefined
4377 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004378 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004380charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004381 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004383 PyObject *rep;
4384 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004385 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386
Christian Heimese93237d2007-12-19 02:37:44 +00004387 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004388 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004389 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004390 if (res == -1)
4391 return enc_FAILED;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004392 if (outsize<requiredsize)
4393 if (!charmapencode_resize(outobj, outpos, requiredsize))
4394 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004395 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004396 outstart[(*outpos)++] = (char)res;
4397 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004398 }
4399
4400 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 if (rep==NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004402 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004403 else if (rep==Py_None) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004404 Py_DECREF(rep);
4405 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004406 } else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004407 if (PyInt_Check(rep)) {
4408 Py_ssize_t requiredsize = *outpos+1;
4409 if (outsize<requiredsize)
4410 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4411 Py_DECREF(rep);
4412 return enc_EXCEPTION;
4413 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004414 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004415 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004416 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004417 else {
4418 const char *repchars = PyString_AS_STRING(rep);
4419 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4420 Py_ssize_t requiredsize = *outpos+repsize;
4421 if (outsize<requiredsize)
4422 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4423 Py_DECREF(rep);
4424 return enc_EXCEPTION;
4425 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004426 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004427 memcpy(outstart + *outpos, repchars, repsize);
4428 *outpos += repsize;
4429 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 }
Georg Brandl9f167602006-06-04 21:46:16 +00004431 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004432 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433}
4434
4435/* handle an error in PyUnicode_EncodeCharmap
4436 Return 0 on success, -1 on error */
4437static
4438int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004439 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004441 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443{
4444 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004445 Py_ssize_t repsize;
4446 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 Py_UNICODE *uni2;
4448 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004449 Py_ssize_t collstartpos = *inpos;
4450 Py_ssize_t collendpos = *inpos+1;
4451 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 char *encoding = "charmap";
4453 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004454 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 /* find all unencodable characters */
4457 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004458 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004459 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004460 int res = encoding_map_lookup(p[collendpos], mapping);
4461 if (res != -1)
4462 break;
4463 ++collendpos;
4464 continue;
4465 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004466
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004467 rep = charmapencode_lookup(p[collendpos], mapping);
4468 if (rep==NULL)
4469 return -1;
4470 else if (rep!=Py_None) {
4471 Py_DECREF(rep);
4472 break;
4473 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004474 Py_DECREF(rep);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004475 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 }
4477 /* cache callback name lookup
4478 * (if not done yet, i.e. it's the first error) */
4479 if (*known_errorHandler==-1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004480 if ((errors==NULL) || (!strcmp(errors, "strict")))
4481 *known_errorHandler = 1;
4482 else if (!strcmp(errors, "replace"))
4483 *known_errorHandler = 2;
4484 else if (!strcmp(errors, "ignore"))
4485 *known_errorHandler = 3;
4486 else if (!strcmp(errors, "xmlcharrefreplace"))
4487 *known_errorHandler = 4;
4488 else
4489 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490 }
4491 switch (*known_errorHandler) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004492 case 1: /* strict */
4493 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4494 return -1;
4495 case 2: /* replace */
4496 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004497 x = charmapencode_output('?', mapping, res, respos);
4498 if (x==enc_EXCEPTION) {
4499 return -1;
4500 }
4501 else if (x==enc_FAILED) {
4502 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4503 return -1;
4504 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004505 }
4506 /* fall through */
4507 case 3: /* ignore */
4508 *inpos = collendpos;
4509 break;
4510 case 4: /* xmlcharrefreplace */
4511 /* generate replacement (temporarily (mis)uses p) */
4512 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004513 char buffer[2+29+1+1];
4514 char *cp;
4515 sprintf(buffer, "&#%d;", (int)p[collpos]);
4516 for (cp = buffer; *cp; ++cp) {
4517 x = charmapencode_output(*cp, mapping, res, respos);
4518 if (x==enc_EXCEPTION)
4519 return -1;
4520 else if (x==enc_FAILED) {
4521 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4522 return -1;
4523 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004524 }
4525 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004526 *inpos = collendpos;
4527 break;
4528 default:
4529 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004530 encoding, reason, p, size, exceptionObject,
4531 collstartpos, collendpos, &newpos);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004532 if (repunicode == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004533 return -1;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004534 /* generate replacement */
4535 repsize = PyUnicode_GET_SIZE(repunicode);
4536 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004537 x = charmapencode_output(*uni2, mapping, res, respos);
4538 if (x==enc_EXCEPTION) {
4539 return -1;
4540 }
4541 else if (x==enc_FAILED) {
4542 Py_DECREF(repunicode);
4543 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4544 return -1;
4545 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004546 }
4547 *inpos = newpos;
4548 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 }
4550 return 0;
4551}
4552
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004554 Py_ssize_t size,
4555 PyObject *mapping,
4556 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 /* output object */
4559 PyObject *res = NULL;
4560 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004561 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004563 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 PyObject *errorHandler = NULL;
4565 PyObject *exc = NULL;
4566 /* the following variable is used for caching string comparisons
4567 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4568 * 3=ignore, 4=xmlcharrefreplace */
4569 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570
4571 /* Default to Latin-1 */
4572 if (mapping == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004573 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575 /* allocate enough for a simple encoding without
4576 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004577 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 if (res == NULL)
4579 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004580 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004581 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 while (inpos<size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004584 /* try to encode it */
4585 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4586 if (x==enc_EXCEPTION) /* error */
4587 goto onError;
4588 if (x==enc_FAILED) { /* unencodable character */
4589 if (charmap_encoding_error(p, size, &inpos, mapping,
4590 &exc,
4591 &known_errorHandler, &errorHandler, errors,
4592 &res, &respos)) {
4593 goto onError;
4594 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004595 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004596 else
4597 /* done with this character => adjust input position */
4598 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004602 if (respos<PyString_GET_SIZE(res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004603 if (_PyString_Resize(&res, respos))
4604 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605 }
4606 Py_XDECREF(exc);
4607 Py_XDECREF(errorHandler);
4608 return res;
4609
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004610 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 Py_XDECREF(res);
4612 Py_XDECREF(exc);
4613 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614 return NULL;
4615}
4616
4617PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004618 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619{
4620 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004621 PyErr_BadArgument();
4622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623 }
4624 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004625 PyUnicode_GET_SIZE(unicode),
4626 mapping,
4627 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628}
4629
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004630/* create or adjust a UnicodeTranslateError */
4631static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004632 const Py_UNICODE *unicode, Py_ssize_t size,
4633 Py_ssize_t startpos, Py_ssize_t endpos,
4634 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 if (*exceptionObject == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004637 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004638 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004639 }
4640 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004641 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4642 goto onError;
4643 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4644 goto onError;
4645 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4646 goto onError;
4647 return;
4648 onError:
4649 Py_DECREF(*exceptionObject);
4650 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651 }
4652}
4653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004654/* raises a UnicodeTranslateError */
4655static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004656 const Py_UNICODE *unicode, Py_ssize_t size,
4657 Py_ssize_t startpos, Py_ssize_t endpos,
4658 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659{
4660 make_translate_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004661 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 if (*exceptionObject != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004663 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664}
4665
4666/* error handling callback helper:
4667 build arguments, call the callback and check the arguments,
4668 put the result into newpos and return the replacement string, which
4669 has to be freed by the caller */
4670static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004671 PyObject **errorHandler,
4672 const char *reason,
4673 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4674 Py_ssize_t startpos, Py_ssize_t endpos,
4675 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004677 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678
Martin v. Löwis412fb672006-04-13 06:34:32 +00004679 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 PyObject *restuple;
4681 PyObject *resunicode;
4682
4683 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004684 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 if (*errorHandler == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687 }
4688
4689 make_translate_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004690 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 if (*exceptionObject == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693
4694 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004695 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004697 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00004699 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004700 Py_DECREF(restuple);
4701 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004702 }
4703 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004704 &resunicode, &i_newpos)) {
4705 Py_DECREF(restuple);
4706 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004707 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004708 if (i_newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004709 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004710 else
4711 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004712 if (*newpos<0 || *newpos>size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004713 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4714 Py_DECREF(restuple);
4715 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004716 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 Py_INCREF(resunicode);
4718 Py_DECREF(restuple);
4719 return resunicode;
4720}
4721
4722/* Lookup the character ch in the mapping and put the result in result,
4723 which must be decrefed by the caller.
4724 Return 0 on success, -1 on error */
4725static
4726int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4727{
4728 PyObject *w = PyInt_FromLong((long)c);
4729 PyObject *x;
4730
4731 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004732 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 x = PyObject_GetItem(mapping, w);
4734 Py_DECREF(w);
4735 if (x == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004736 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4737 /* No mapping found means: use 1:1 mapping. */
4738 PyErr_Clear();
4739 *result = NULL;
4740 return 0;
4741 } else
4742 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 }
4744 else if (x == Py_None) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004745 *result = x;
4746 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004747 }
4748 else if (PyInt_Check(x)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004749 long value = PyInt_AS_LONG(x);
4750 long max = PyUnicode_GetMax();
4751 if (value < 0 || value > max) {
4752 PyErr_Format(PyExc_TypeError,
4753 "character mapping must be in range(0x%lx)", max+1);
4754 Py_DECREF(x);
4755 return -1;
4756 }
4757 *result = x;
4758 return 0;
4759 }
4760 else if (PyUnicode_Check(x)) {
4761 *result = x;
4762 return 0;
4763 }
4764 else {
4765 /* wrong return value */
4766 PyErr_SetString(PyExc_TypeError,
4767 "character mapping must return integer, None or unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004768 Py_DECREF(x);
4769 return -1;
4770 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771}
4772/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004773 if not reallocate and adjust various state variables.
4774 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775static
Walter Dörwald4894c302003-10-24 14:25:28 +00004776int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004777 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004778{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004779 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004780 if (requiredsize > oldsize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004781 /* remember old output position */
4782 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4783 /* exponentially overallocate to minimize reallocations */
4784 if (requiredsize < 2 * oldsize)
4785 requiredsize = 2 * oldsize;
4786 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4787 return -1;
4788 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 }
4790 return 0;
4791}
4792/* lookup the character, put the result in the output string and adjust
4793 various state variables. Return a new reference to the object that
4794 was put in the output buffer in *result, or Py_None, if the mapping was
4795 undefined (in which case no character was written).
4796 The called must decref result.
4797 Return 0 on success, -1 on error. */
4798static
Walter Dörwald4894c302003-10-24 14:25:28 +00004799int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004800 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4801 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802{
Walter Dörwald4894c302003-10-24 14:25:28 +00004803 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004804 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 if (*res==NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004806 /* not found => default to 1:1 mapping */
4807 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808 }
4809 else if (*res==Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004810 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 else if (PyInt_Check(*res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004812 /* no overflow check, because we know that the space is enough */
4813 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 }
4815 else if (PyUnicode_Check(*res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004816 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4817 if (repsize==1) {
4818 /* no overflow check, because we know that the space is enough */
4819 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4820 }
4821 else if (repsize!=0) {
4822 /* more than one character */
4823 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4824 (insize - (curinp-startinp)) +
4825 repsize - 1;
4826 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4827 return -1;
4828 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4829 *outp += repsize;
4830 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 }
4832 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004833 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 return 0;
4835}
4836
4837PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004838 Py_ssize_t size,
4839 PyObject *mapping,
4840 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 /* output object */
4843 PyObject *res = NULL;
4844 /* pointers to the beginning and end+1 of input */
4845 const Py_UNICODE *startp = p;
4846 const Py_UNICODE *endp = p + size;
4847 /* pointer into the output */
4848 Py_UNICODE *str;
4849 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004850 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 char *reason = "character maps to <undefined>";
4852 PyObject *errorHandler = NULL;
4853 PyObject *exc = NULL;
4854 /* the following variable is used for caching string comparisons
4855 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4856 * 3=ignore, 4=xmlcharrefreplace */
4857 int known_errorHandler = -1;
4858
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 if (mapping == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004860 PyErr_BadArgument();
4861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863
4864 /* allocate enough for a simple 1:1 translation without
4865 replacements, if we need more, we'll resize */
4866 res = PyUnicode_FromUnicode(NULL, size);
4867 if (res == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004870 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873 while (p<endp) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004874 /* try to encode it */
4875 PyObject *x = NULL;
4876 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4877 Py_XDECREF(x);
4878 goto onError;
4879 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004880 Py_XDECREF(x);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004881 if (x!=Py_None) /* it worked => adjust input pointer */
4882 ++p;
4883 else { /* untranslatable character */
4884 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4885 Py_ssize_t repsize;
4886 Py_ssize_t newpos;
4887 Py_UNICODE *uni2;
4888 /* startpos for collecting untranslatable chars */
4889 const Py_UNICODE *collstart = p;
4890 const Py_UNICODE *collend = p+1;
4891 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004893 /* find all untranslatable characters */
4894 while (collend < endp) {
4895 if (charmaptranslate_lookup(*collend, mapping, &x))
4896 goto onError;
4897 Py_XDECREF(x);
4898 if (x!=Py_None)
4899 break;
4900 ++collend;
4901 }
4902 /* cache callback name lookup
4903 * (if not done yet, i.e. it's the first error) */
4904 if (known_errorHandler==-1) {
4905 if ((errors==NULL) || (!strcmp(errors, "strict")))
4906 known_errorHandler = 1;
4907 else if (!strcmp(errors, "replace"))
4908 known_errorHandler = 2;
4909 else if (!strcmp(errors, "ignore"))
4910 known_errorHandler = 3;
4911 else if (!strcmp(errors, "xmlcharrefreplace"))
4912 known_errorHandler = 4;
4913 else
4914 known_errorHandler = 0;
4915 }
4916 switch (known_errorHandler) {
4917 case 1: /* strict */
4918 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004919 goto onError;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004920 case 2: /* replace */
4921 /* No need to check for space, this is a 1:1 replacement */
4922 for (coll = collstart; coll<collend; ++coll)
4923 *str++ = '?';
4924 /* fall through */
4925 case 3: /* ignore */
4926 p = collend;
4927 break;
4928 case 4: /* xmlcharrefreplace */
4929 /* generate replacement (temporarily (mis)uses p) */
4930 for (p = collstart; p < collend; ++p) {
4931 char buffer[2+29+1+1];
4932 char *cp;
4933 sprintf(buffer, "&#%d;", (int)*p);
4934 if (charmaptranslate_makespace(&res, &str,
4935 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4936 goto onError;
4937 for (cp = buffer; *cp; ++cp)
4938 *str++ = *cp;
4939 }
4940 p = collend;
4941 break;
4942 default:
4943 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4944 reason, startp, size, &exc,
4945 collstart-startp, collend-startp, &newpos);
4946 if (repunicode == NULL)
4947 goto onError;
4948 /* generate replacement */
4949 repsize = PyUnicode_GET_SIZE(repunicode);
4950 if (charmaptranslate_makespace(&res, &str,
4951 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4952 Py_DECREF(repunicode);
4953 goto onError;
4954 }
4955 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4956 *str++ = *uni2;
4957 p = startp + newpos;
4958 Py_DECREF(repunicode);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004959 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004960 }
4961 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004962 /* Resize if we allocated to much */
4963 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004964 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004965 if (PyUnicode_Resize(&res, respos) < 0)
4966 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 }
4968 Py_XDECREF(exc);
4969 Py_XDECREF(errorHandler);
4970 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004972 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 Py_XDECREF(res);
4974 Py_XDECREF(exc);
4975 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976 return NULL;
4977}
4978
4979PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004980 PyObject *mapping,
4981 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982{
4983 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004984
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 str = PyUnicode_FromObject(str);
4986 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004987 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004989 PyUnicode_GET_SIZE(str),
4990 mapping,
4991 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992 Py_DECREF(str);
4993 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004994
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004995 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 Py_XDECREF(str);
4997 return NULL;
4998}
Tim Petersced69f82003-09-16 20:30:58 +00004999
Guido van Rossum9e896b32000-04-05 20:11:21 +00005000/* --- Decimal Encoder ---------------------------------------------------- */
5001
5002int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005003 Py_ssize_t length,
5004 char *output,
5005 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005006{
5007 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 PyObject *errorHandler = NULL;
5009 PyObject *exc = NULL;
5010 const char *encoding = "decimal";
5011 const char *reason = "invalid decimal Unicode string";
5012 /* the following variable is used for caching string comparisons
5013 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5014 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005015
5016 if (output == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005017 PyErr_BadArgument();
5018 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005019 }
5020
5021 p = s;
5022 end = s + length;
5023 while (p < end) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005024 register Py_UNICODE ch = *p;
5025 int decimal;
5026 PyObject *repunicode;
5027 Py_ssize_t repsize;
5028 Py_ssize_t newpos;
5029 Py_UNICODE *uni2;
5030 Py_UNICODE *collstart;
5031 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005032
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005033 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005034 *output++ = ' ';
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005035 ++p;
5036 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005037 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005038 decimal = Py_UNICODE_TODECIMAL(ch);
5039 if (decimal >= 0) {
5040 *output++ = '0' + decimal;
5041 ++p;
5042 continue;
5043 }
5044 if (0 < ch && ch < 256) {
5045 *output++ = (char)ch;
5046 ++p;
5047 continue;
5048 }
5049 /* All other characters are considered unencodable */
5050 collstart = p;
5051 collend = p+1;
5052 while (collend < end) {
5053 if ((0 < *collend && *collend < 256) ||
5054 !Py_UNICODE_ISSPACE(*collend) ||
5055 Py_UNICODE_TODECIMAL(*collend))
5056 break;
5057 }
5058 /* cache callback name lookup
5059 * (if not done yet, i.e. it's the first error) */
5060 if (known_errorHandler==-1) {
5061 if ((errors==NULL) || (!strcmp(errors, "strict")))
5062 known_errorHandler = 1;
5063 else if (!strcmp(errors, "replace"))
5064 known_errorHandler = 2;
5065 else if (!strcmp(errors, "ignore"))
5066 known_errorHandler = 3;
5067 else if (!strcmp(errors, "xmlcharrefreplace"))
5068 known_errorHandler = 4;
5069 else
5070 known_errorHandler = 0;
5071 }
5072 switch (known_errorHandler) {
5073 case 1: /* strict */
5074 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5075 goto onError;
5076 case 2: /* replace */
5077 for (p = collstart; p < collend; ++p)
5078 *output++ = '?';
5079 /* fall through */
5080 case 3: /* ignore */
5081 p = collend;
5082 break;
5083 case 4: /* xmlcharrefreplace */
5084 /* generate replacement (temporarily (mis)uses p) */
5085 for (p = collstart; p < collend; ++p)
5086 output += sprintf(output, "&#%d;", (int)*p);
5087 p = collend;
5088 break;
5089 default:
5090 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5091 encoding, reason, s, length, &exc,
5092 collstart-s, collend-s, &newpos);
5093 if (repunicode == NULL)
5094 goto onError;
5095 /* generate replacement */
5096 repsize = PyUnicode_GET_SIZE(repunicode);
5097 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5098 Py_UNICODE ch = *uni2;
5099 if (Py_UNICODE_ISSPACE(ch))
5100 *output++ = ' ';
5101 else {
5102 decimal = Py_UNICODE_TODECIMAL(ch);
5103 if (decimal >= 0)
5104 *output++ = '0' + decimal;
5105 else if (0 < ch && ch < 256)
5106 *output++ = (char)ch;
5107 else {
5108 Py_DECREF(repunicode);
5109 raise_encode_exception(&exc, encoding,
5110 s, length, collstart-s, collend-s, reason);
5111 goto onError;
5112 }
5113 }
5114 }
5115 p = s + newpos;
5116 Py_DECREF(repunicode);
5117 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005118 }
5119 /* 0-terminate the output string */
5120 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005121 Py_XDECREF(exc);
5122 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005123 return 0;
5124
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005125 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126 Py_XDECREF(exc);
5127 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005128 return -1;
5129}
5130
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131/* --- Helpers ------------------------------------------------------------ */
5132
Eric Smitha9f7d622008-02-17 19:46:49 +00005133#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005134
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005135#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005136
Fredrik Lundha50d2012006-05-26 17:04:58 +00005137#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005138
5139#include "stringlib/count.h"
5140#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005141#include "stringlib/partition.h"
5142
/* Helper macro to fix up start/end slice values.  Operates on variables
   named `start` and `end` in the expanding scope: a negative value is
   taken relative to (obj)->length and then floored at 0; `end` is first
   capped at (obj)->length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5155
Martin v. Löwis18e16552006-02-15 17:27:45 +00005156Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005157 PyObject *substr,
5158 Py_ssize_t start,
5159 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005161 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005162 PyUnicodeObject* str_obj;
5163 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005164
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005165 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5166 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005167 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005168 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5169 if (!sub_obj) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005170 Py_DECREF(str_obj);
5171 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 }
Tim Petersced69f82003-09-16 20:30:58 +00005173
Fredrik Lundhc8162812006-05-26 19:33:03 +00005174 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005175
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005176 result = stringlib_count(
5177 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5178 );
5179
5180 Py_DECREF(sub_obj);
5181 Py_DECREF(str_obj);
5182
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 return result;
5184}
5185
Martin v. Löwis18e16552006-02-15 17:27:45 +00005186Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005187 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005188 Py_ssize_t start,
5189 Py_ssize_t end,
5190 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005192 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005193
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005194 str = PyUnicode_FromObject(str);
5195 if (!str)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005196 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005197 sub = PyUnicode_FromObject(sub);
5198 if (!sub) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005199 Py_DECREF(str);
5200 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 }
Tim Petersced69f82003-09-16 20:30:58 +00005202
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005203 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005204 result = stringlib_find_slice(
5205 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5206 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5207 start, end
5208 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005209 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005210 result = stringlib_rfind_slice(
5211 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5212 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5213 start, end
5214 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005215
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005216 Py_DECREF(str);
5217 Py_DECREF(sub);
5218
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 return result;
5220}
5221
Tim Petersced69f82003-09-16 20:30:58 +00005222static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223int tailmatch(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005224 PyUnicodeObject *substring,
5225 Py_ssize_t start,
5226 Py_ssize_t end,
5227 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 if (substring->length == 0)
5230 return 1;
5231
Fredrik Lundhc8162812006-05-26 19:33:03 +00005232 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233
5234 end -= substring->length;
5235 if (end < start)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005236 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237
5238 if (direction > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005239 if (Py_UNICODE_MATCH(self, end, substring))
5240 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 } else {
5242 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005243 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 }
5245
5246 return 0;
5247}
5248
Martin v. Löwis18e16552006-02-15 17:27:45 +00005249Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005250 PyObject *substr,
5251 Py_ssize_t start,
5252 Py_ssize_t end,
5253 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005255 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005256
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 str = PyUnicode_FromObject(str);
5258 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005259 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 substr = PyUnicode_FromObject(substr);
5261 if (substr == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005262 Py_DECREF(str);
5263 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 }
Tim Petersced69f82003-09-16 20:30:58 +00005265
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005267 (PyUnicodeObject *)substr,
5268 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 Py_DECREF(str);
5270 Py_DECREF(substr);
5271 return result;
5272}
5273
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274/* Apply fixfct filter to the Unicode object self and return a
5275 reference to the modified object */
5276
Tim Petersced69f82003-09-16 20:30:58 +00005277static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005279 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280{
5281
5282 PyUnicodeObject *u;
5283
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005284 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005286 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005287
5288 Py_UNICODE_COPY(u->str, self->str, self->length);
5289
Tim Peters7a29bd52001-09-12 03:03:31 +00005290 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005291 /* fixfct should return TRUE if it modified the buffer. If
5292 FALSE, return a reference to the original buffer instead
5293 (to save space, not time) */
5294 Py_INCREF(self);
5295 Py_DECREF(u);
5296 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 }
5298 return (PyObject*) u;
5299}
5300
Tim Petersced69f82003-09-16 20:30:58 +00005301static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302int fixupper(PyUnicodeObject *self)
5303{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005304 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 Py_UNICODE *s = self->str;
5306 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005307
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 while (len-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005309 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005310
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005311 ch = Py_UNICODE_TOUPPER(*s);
5312 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 status = 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005314 *s = ch;
5315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 s++;
5317 }
5318
5319 return status;
5320}
5321
Tim Petersced69f82003-09-16 20:30:58 +00005322static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323int fixlower(PyUnicodeObject *self)
5324{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005325 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 Py_UNICODE *s = self->str;
5327 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005328
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 while (len-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005330 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005331
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005332 ch = Py_UNICODE_TOLOWER(*s);
5333 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 status = 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005335 *s = ch;
5336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 s++;
5338 }
5339
5340 return status;
5341}
5342
Tim Petersced69f82003-09-16 20:30:58 +00005343static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344int fixswapcase(PyUnicodeObject *self)
5345{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005346 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 Py_UNICODE *s = self->str;
5348 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005349
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 while (len-- > 0) {
5351 if (Py_UNICODE_ISUPPER(*s)) {
5352 *s = Py_UNICODE_TOLOWER(*s);
5353 status = 1;
5354 } else if (Py_UNICODE_ISLOWER(*s)) {
5355 *s = Py_UNICODE_TOUPPER(*s);
5356 status = 1;
5357 }
5358 s++;
5359 }
5360
5361 return status;
5362}
5363
Tim Petersced69f82003-09-16 20:30:58 +00005364static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365int fixcapitalize(PyUnicodeObject *self)
5366{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005367 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005368 Py_UNICODE *s = self->str;
5369 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005370
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005371 if (len == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005372 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005373 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005374 *s = Py_UNICODE_TOUPPER(*s);
5375 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005377 s++;
5378 while (--len > 0) {
5379 if (Py_UNICODE_ISUPPER(*s)) {
5380 *s = Py_UNICODE_TOLOWER(*s);
5381 status = 1;
5382 }
5383 s++;
5384 }
5385 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386}
5387
5388static
5389int fixtitle(PyUnicodeObject *self)
5390{
5391 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5392 register Py_UNICODE *e;
5393 int previous_is_cased;
5394
5395 /* Shortcut for single character strings */
5396 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005397 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5398 if (*p != ch) {
5399 *p = ch;
5400 return 1;
5401 }
5402 else
5403 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 }
Tim Petersced69f82003-09-16 20:30:58 +00005405
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 e = p + PyUnicode_GET_SIZE(self);
5407 previous_is_cased = 0;
5408 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005409 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005410
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005411 if (previous_is_cased)
5412 *p = Py_UNICODE_TOLOWER(ch);
5413 else
5414 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005415
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005416 if (Py_UNICODE_ISLOWER(ch) ||
5417 Py_UNICODE_ISUPPER(ch) ||
5418 Py_UNICODE_ISTITLE(ch))
5419 previous_is_cased = 1;
5420 else
5421 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 }
5423 return 1;
5424}
5425
Tim Peters8ce9f162004-08-27 01:49:32 +00005426PyObject *
5427PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428{
Tim Peters8ce9f162004-08-27 01:49:32 +00005429 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005430 const Py_UNICODE blank = ' ';
5431 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005432 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005433 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005434 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5435 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005436 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5437 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005438 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005439 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005440 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441
Tim Peters05eba1f2004-08-27 21:32:02 +00005442 fseq = PySequence_Fast(seq, "");
5443 if (fseq == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005444 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005445 }
5446
Tim Peters91879ab2004-08-27 22:35:44 +00005447 /* Grrrr. A codec may be invoked to convert str objects to
5448 * Unicode, and so it's possible to call back into Python code
5449 * during PyUnicode_FromObject(), and so it's possible for a sick
5450 * codec to change the size of fseq (if seq is a list). Therefore
5451 * we have to keep refetching the size -- can't assume seqlen
5452 * is invariant.
5453 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005454 seqlen = PySequence_Fast_GET_SIZE(fseq);
5455 /* If empty sequence, return u"". */
5456 if (seqlen == 0) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005457 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5458 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005459 }
5460 /* If singleton sequence with an exact Unicode, return that. */
5461 if (seqlen == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005462 item = PySequence_Fast_GET_ITEM(fseq, 0);
5463 if (PyUnicode_CheckExact(item)) {
5464 Py_INCREF(item);
5465 res = (PyUnicodeObject *)item;
5466 goto Done;
5467 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005468 }
5469
Tim Peters05eba1f2004-08-27 21:32:02 +00005470 /* At least two items to join, or one that isn't exact Unicode. */
5471 if (seqlen > 1) {
5472 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005473 if (separator == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005474 sep = &blank;
5475 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005476 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005477 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005478 internal_separator = PyUnicode_FromObject(separator);
5479 if (internal_separator == NULL)
5480 goto onError;
5481 sep = PyUnicode_AS_UNICODE(internal_separator);
5482 seplen = PyUnicode_GET_SIZE(internal_separator);
5483 /* In case PyUnicode_FromObject() mutated seq. */
5484 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005485 }
5486 }
5487
5488 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005489 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005490 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005491 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005492 res_p = PyUnicode_AS_UNICODE(res);
5493 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005494
Tim Peters05eba1f2004-08-27 21:32:02 +00005495 for (i = 0; i < seqlen; ++i) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005496 Py_ssize_t itemlen;
5497 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005498
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005499 item = PySequence_Fast_GET_ITEM(fseq, i);
5500 /* Convert item to Unicode. */
5501 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5502 PyErr_Format(PyExc_TypeError,
5503 "sequence item %zd: expected string or Unicode,"
5504 " %.80s found",
5505 i, Py_TYPE(item)->tp_name);
5506 goto onError;
5507 }
5508 item = PyUnicode_FromObject(item);
5509 if (item == NULL)
5510 goto onError;
5511 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005512
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005513 /* In case PyUnicode_FromObject() mutated seq. */
5514 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005515
Tim Peters8ce9f162004-08-27 01:49:32 +00005516 /* Make sure we have enough space for the separator and the item. */
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005517 itemlen = PyUnicode_GET_SIZE(item);
5518 new_res_used = res_used + itemlen;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005519 if (new_res_used < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005520 goto Overflow;
5521 if (i < seqlen - 1) {
5522 new_res_used += seplen;
5523 if (new_res_used < 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005524 goto Overflow;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005525 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005526 if (new_res_used > res_alloc) {
5527 /* double allocated size until it's big enough */
5528 do {
5529 res_alloc += res_alloc;
5530 if (res_alloc <= 0)
5531 goto Overflow;
5532 } while (new_res_used > res_alloc);
5533 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5534 Py_DECREF(item);
5535 goto onError;
5536 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005537 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005538 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005539
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005540 /* Copy item, and maybe the separator. */
5541 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5542 res_p += itemlen;
5543 if (i < seqlen - 1) {
5544 Py_UNICODE_COPY(res_p, sep, seplen);
5545 res_p += seplen;
5546 }
5547 Py_DECREF(item);
5548 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005549 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005550
Tim Peters05eba1f2004-08-27 21:32:02 +00005551 /* Shrink res to match the used area; this probably can't fail,
5552 * but it's cheap to check.
5553 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005554 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005555 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005556
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005557 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005558 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005559 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 return (PyObject *)res;
5561
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005562 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005563 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005564 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005565 Py_DECREF(item);
5566 /* fall through */
5567
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005568 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005569 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005570 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005571 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 return NULL;
5573}
5574
Tim Petersced69f82003-09-16 20:30:58 +00005575static
5576PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005577 Py_ssize_t left,
5578 Py_ssize_t right,
5579 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580{
5581 PyUnicodeObject *u;
5582
5583 if (left < 0)
5584 left = 0;
5585 if (right < 0)
5586 right = 0;
5587
Tim Peters7a29bd52001-09-12 03:03:31 +00005588 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 Py_INCREF(self);
5590 return self;
5591 }
5592
Neal Norwitze7d8be82008-07-31 17:17:14 +00005593 if (left > PY_SSIZE_T_MAX - self->length ||
5594 right > PY_SSIZE_T_MAX - (left + self->length)) {
5595 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5596 return NULL;
5597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 u = _PyUnicode_New(left + self->length + right);
5599 if (u) {
5600 if (left)
5601 Py_UNICODE_FILL(u->str, fill, left);
5602 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5603 if (right)
5604 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5605 }
5606
5607 return u;
5608}
5609
/* Append the slice data[left:right] to the result list as a new Unicode
   object.

   Relies on three names in the calling function: a `PyObject *str`
   scratch variable, the `list` being built, and an `onError` label that
   is jumped to if either the slice cannot be created or the append
   fails.  The temporary reference held in str is always released.

   The body is wrapped in do { } while (0) so that the expansion is one
   statement and is safe as the body of an unbraced if/else.  Invoke it
   with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620
5621static
5622PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005623 PyObject *list,
5624 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005626 register Py_ssize_t i;
5627 register Py_ssize_t j;
5628 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005630 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631
5632 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005633 /* find a token */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005634 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005635 i++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005636 j = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005637 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5638 i++;
5639 if (j < i) {
5640 if (maxcount-- <= 0)
5641 break;
5642 SPLIT_APPEND(buf, j, i);
5643 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5644 i++;
5645 j = i;
5646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 }
5648 if (j < len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005649 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650 }
5651 return list;
5652
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005653 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 Py_DECREF(list);
5655 return NULL;
5656}
5657
5658PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005659 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005661 register Py_ssize_t i;
5662 register Py_ssize_t j;
5663 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 PyObject *list;
5665 PyObject *str;
5666 Py_UNICODE *data;
5667
5668 string = PyUnicode_FromObject(string);
5669 if (string == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 data = PyUnicode_AS_UNICODE(string);
5672 len = PyUnicode_GET_SIZE(string);
5673
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 list = PyList_New(0);
5675 if (!list)
5676 goto onError;
5677
5678 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005679 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005680
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005681 /* Find a line and append it */
5682 while (i < len && !BLOOM_LINEBREAK(data[i]))
5683 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005685 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005686 eol = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005687 if (i < len) {
5688 if (data[i] == '\r' && i + 1 < len &&
5689 data[i+1] == '\n')
5690 i += 2;
5691 else
5692 i++;
5693 if (keepends)
5694 eol = i;
5695 }
5696 SPLIT_APPEND(data, j, eol);
5697 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 }
5699 if (j < len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005700 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 }
5702
5703 Py_DECREF(string);
5704 return list;
5705
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005706 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005707 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 Py_DECREF(string);
5709 return NULL;
5710}
5711
Tim Petersced69f82003-09-16 20:30:58 +00005712static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005714 PyObject *list,
5715 Py_UNICODE ch,
5716 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005718 register Py_ssize_t i;
5719 register Py_ssize_t j;
5720 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005722 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723
5724 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005725 if (buf[i] == ch) {
5726 if (maxcount-- <= 0)
5727 break;
5728 SPLIT_APPEND(buf, j, i);
5729 i = j = i + 1;
5730 } else
5731 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 }
5733 if (j <= len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005734 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 }
5736 return list;
5737
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005738 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 Py_DECREF(list);
5740 return NULL;
5741}
5742
Tim Petersced69f82003-09-16 20:30:58 +00005743static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005745 PyObject *list,
5746 PyUnicodeObject *substring,
5747 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005749 register Py_ssize_t i;
5750 register Py_ssize_t j;
5751 Py_ssize_t len = self->length;
5752 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 PyObject *str;
5754
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005755 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005756 if (Py_UNICODE_MATCH(self, i, substring)) {
5757 if (maxcount-- <= 0)
5758 break;
5759 SPLIT_APPEND(self->str, j, i);
5760 i = j = i + sublen;
5761 } else
5762 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 }
5764 if (j <= len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005765 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 }
5767 return list;
5768
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005769 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 Py_DECREF(list);
5771 return NULL;
5772}
5773
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005774static
5775PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005776 PyObject *list,
5777 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005779 register Py_ssize_t i;
5780 register Py_ssize_t j;
5781 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005782 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005783 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005784
5785 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005786 /* find a token */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005787 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005788 i--;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005789 j = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005790 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5791 i--;
5792 if (j > i) {
5793 if (maxcount-- <= 0)
5794 break;
5795 SPLIT_APPEND(buf, i + 1, j + 1);
5796 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5797 i--;
5798 j = i;
5799 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005800 }
5801 if (j >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005802 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005803 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005804 if (PyList_Reverse(list) < 0)
5805 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005806 return list;
5807
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005808 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005809 Py_DECREF(list);
5810 return NULL;
5811}
5812
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005813static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005814PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005815 PyObject *list,
5816 Py_UNICODE ch,
5817 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005818{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005819 register Py_ssize_t i;
5820 register Py_ssize_t j;
5821 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005822 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005823 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005824
5825 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005826 if (buf[i] == ch) {
5827 if (maxcount-- <= 0)
5828 break;
5829 SPLIT_APPEND(buf, i + 1, j + 1);
5830 j = i = i - 1;
5831 } else
5832 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005833 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005834 if (j >= -1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005835 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005836 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005837 if (PyList_Reverse(list) < 0)
5838 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005839 return list;
5840
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005841 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005842 Py_DECREF(list);
5843 return NULL;
5844}
5845
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005846static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005847PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005848 PyObject *list,
5849 PyUnicodeObject *substring,
5850 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005851{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005852 register Py_ssize_t i;
5853 register Py_ssize_t j;
5854 Py_ssize_t len = self->length;
5855 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005856 PyObject *str;
5857
5858 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005859 if (Py_UNICODE_MATCH(self, i, substring)) {
5860 if (maxcount-- <= 0)
5861 break;
5862 SPLIT_APPEND(self->str, i + sublen, j);
5863 j = i;
5864 i -= sublen;
5865 } else
5866 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005867 }
5868 if (j >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005869 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005870 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005871 if (PyList_Reverse(list) < 0)
5872 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005873 return list;
5874
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005875 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005876 Py_DECREF(list);
5877 return NULL;
5878}
5879
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880#undef SPLIT_APPEND
5881
5882static
5883PyObject *split(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005884 PyUnicodeObject *substring,
5885 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886{
5887 PyObject *list;
5888
5889 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005890 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
5892 list = PyList_New(0);
5893 if (!list)
5894 return NULL;
5895
5896 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005897 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898
5899 else if (substring->length == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005900 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901
5902 else if (substring->length == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005903 Py_DECREF(list);
5904 PyErr_SetString(PyExc_ValueError, "empty separator");
5905 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 }
5907 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005908 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909}
5910
Tim Petersced69f82003-09-16 20:30:58 +00005911static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005912PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005913 PyUnicodeObject *substring,
5914 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005915{
5916 PyObject *list;
5917
5918 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005919 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005920
5921 list = PyList_New(0);
5922 if (!list)
5923 return NULL;
5924
5925 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005926 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005927
5928 else if (substring->length == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005929 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005930
5931 else if (substring->length == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005932 Py_DECREF(list);
5933 PyErr_SetString(PyExc_ValueError, "empty separator");
5934 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005935 }
5936 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005937 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005938}
5939
5940static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005942 PyUnicodeObject *str1,
5943 PyUnicodeObject *str2,
5944 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945{
5946 PyUnicodeObject *u;
5947
5948 if (maxcount < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005949 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
Fredrik Lundh347ee272006-05-24 16:35:18 +00005951 if (str1->length == str2->length) {
5952 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005953 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005954 if (str1->length == 1) {
5955 /* replace characters */
5956 Py_UNICODE u1, u2;
5957 if (!findchar(self->str, self->length, str1->str[0]))
5958 goto nothing;
5959 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5960 if (!u)
5961 return NULL;
5962 Py_UNICODE_COPY(u->str, self->str, self->length);
5963 u1 = str1->str[0];
5964 u2 = str2->str[0];
5965 for (i = 0; i < u->length; i++)
5966 if (u->str[i] == u1) {
5967 if (--maxcount < 0)
5968 break;
5969 u->str[i] = u2;
5970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005972 i = fastsearch(
5973 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005975 if (i < 0)
5976 goto nothing;
5977 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5978 if (!u)
5979 return NULL;
5980 Py_UNICODE_COPY(u->str, self->str, self->length);
5981 while (i <= self->length - str1->length)
5982 if (Py_UNICODE_MATCH(self, i, str1)) {
5983 if (--maxcount < 0)
5984 break;
5985 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5986 i += str1->length;
5987 } else
5988 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005991
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005992 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005993 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 Py_UNICODE *p;
5995
5996 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005997 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 if (n > maxcount)
5999 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006000 if (n == 0)
6001 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006002 /* new_size = self->length + n * (str2->length - str1->length)); */
6003 delta = (str2->length - str1->length);
6004 if (delta == 0) {
6005 new_size = self->length;
6006 } else {
6007 product = n * (str2->length - str1->length);
6008 if ((product / (str2->length - str1->length)) != n) {
6009 PyErr_SetString(PyExc_OverflowError,
6010 "replace string is too long");
6011 return NULL;
6012 }
6013 new_size = self->length + product;
6014 if (new_size < 0) {
6015 PyErr_SetString(PyExc_OverflowError,
6016 "replace string is too long");
6017 return NULL;
6018 }
6019 }
6020 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006021 if (!u)
6022 return NULL;
6023 i = 0;
6024 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006025 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006026 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006027 while (n-- > 0) {
6028 /* look for next match */
6029 j = i;
6030 while (j <= e) {
6031 if (Py_UNICODE_MATCH(self, j, str1))
6032 break;
6033 j++;
6034 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006035 if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006036 if (j > e)
6037 break;
6038 /* copy unchanged part [i:j] */
6039 Py_UNICODE_COPY(p, self->str+i, j-i);
6040 p += j - i;
6041 }
6042 /* copy substitution string */
6043 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006044 Py_UNICODE_COPY(p, str2->str, str2->length);
6045 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006046 }
6047 i = j + str1->length;
6048 }
6049 if (i < self->length)
6050 /* copy tail [i:] */
6051 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006052 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006053 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006054 while (n > 0) {
6055 Py_UNICODE_COPY(p, str2->str, str2->length);
6056 p += str2->length;
6057 if (--n <= 0)
6058 break;
6059 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006061 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 }
6063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006065
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006066 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006067 /* nothing to replace; return original string (when possible) */
6068 if (PyUnicode_CheckExact(self)) {
6069 Py_INCREF(self);
6070 return (PyObject *) self;
6071 }
6072 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073}
6074
6075/* --- Unicode Object Methods --------------------------------------------- */
6076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006077PyDoc_STRVAR(title__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006078 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079\n\
6080Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006081characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082
6083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006084unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 return fixup(self, fixtitle);
6087}
6088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006089PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006090 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091\n\
6092Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006093have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094
6095static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006096unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 return fixup(self, fixcapitalize);
6099}
6100
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self on whitespace,
   capitalize each word in place in the list, then join the words again.
   Returns the joined string, or NULL with an exception set. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Whitespace split with no limit on the number of splits. */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old item, so drop it here. */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6138
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006139/* Argument converter. Coerces to a single unicode character */
6140
6141static int
6142convert_uc(PyObject *obj, void *addr)
6143{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006144 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6145 PyObject *uniobj;
6146 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006147
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006148 uniobj = PyUnicode_FromObject(obj);
6149 if (uniobj == NULL) {
6150 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006151 "The fill character cannot be converted to Unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006152 return 0;
6153 }
6154 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6155 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006156 "The fill character must be exactly one character long");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006157 Py_DECREF(uniobj);
6158 return 0;
6159 }
6160 unistr = PyUnicode_AS_UNICODE(uniobj);
6161 *fillcharloc = unistr[0];
6162 Py_DECREF(uniobj);
6163 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006164}
6165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006166PyDoc_STRVAR(center__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006167 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006169Return S centered in a Unicode string of length width. Padding is\n\
6170done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
6172static PyObject *
6173unicode_center(PyUnicodeObject *self, PyObject *args)
6174{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006175 Py_ssize_t marg, left;
6176 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006177 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178
Thomas Woutersde017742006-02-16 19:34:37 +00006179 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 return NULL;
6181
Tim Peters7a29bd52001-09-12 03:03:31 +00006182 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 Py_INCREF(self);
6184 return (PyObject*) self;
6185 }
6186
6187 marg = width - self->length;
6188 left = marg / 2 + (marg & width & 1);
6189
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006190 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191}
6192
Marc-André Lemburge5034372000-08-08 08:04:29 +00006193#if 0
6194
6195/* This code should go into some future Unicode collation support
6196 module. The basic comparison should compare ordinals on a naive
Georg Brandla3c242c2009-10-27 14:19:50 +00006197 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006198
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006199/* speedy UTF-16 code point order comparison */
6200/* gleaned from: */
6201/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6202
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006203static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006204{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006205 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006206 0, 0, 0, 0, 0, 0, 0, 0,
6207 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006208 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006209};
6210
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211static int
6212unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006214 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006215
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 Py_UNICODE *s1 = str1->str;
6217 Py_UNICODE *s2 = str2->str;
6218
6219 len1 = str1->length;
6220 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006221
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006223 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006224
6225 c1 = *s1++;
6226 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006227
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006228 if (c1 > (1<<11) * 26)
6229 c1 += utf16Fixup[c1>>11];
6230 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006231 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006232 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006233
6234 if (c1 != c2)
6235 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006236
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006237 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 }
6239
6240 return (len1 < len2) ? -1 : (len1 != len2);
6241}
6242
Marc-André Lemburge5034372000-08-08 08:04:29 +00006243#else
6244
6245static int
6246unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006248 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006249
6250 Py_UNICODE *s1 = str1->str;
6251 Py_UNICODE *s2 = str2->str;
6252
6253 len1 = str1->length;
6254 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006255
Marc-André Lemburge5034372000-08-08 08:04:29 +00006256 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006257 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006258
Fredrik Lundh45714e92001-06-26 16:39:36 +00006259 c1 = *s1++;
6260 c2 = *s2++;
6261
6262 if (c1 != c2)
6263 return (c1 < c2) ? -1 : 1;
6264
Marc-André Lemburge5034372000-08-08 08:04:29 +00006265 len1--; len2--;
6266 }
6267
6268 return (len1 < len2) ? -1 : (len1 != len2);
6269}
6270
6271#endif
6272
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273int PyUnicode_Compare(PyObject *left,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006274 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275{
6276 PyUnicodeObject *u = NULL, *v = NULL;
6277 int result;
6278
6279 /* Coerce the two arguments */
6280 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6281 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006282 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6284 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006285 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286
Thomas Wouters7e474022000-07-16 12:04:32 +00006287 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 if (v == u) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006289 Py_DECREF(u);
6290 Py_DECREF(v);
6291 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 }
6293
6294 result = unicode_compare(u, v);
6295
6296 Py_DECREF(u);
6297 Py_DECREF(v);
6298 return result;
6299
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006300 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 Py_XDECREF(u);
6302 Py_XDECREF(v);
6303 return -1;
6304}
6305
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006306PyObject *PyUnicode_RichCompare(PyObject *left,
6307 PyObject *right,
6308 int op)
6309{
6310 int result;
6311
6312 result = PyUnicode_Compare(left, right);
6313 if (result == -1 && PyErr_Occurred())
6314 goto onError;
6315
6316 /* Convert the return value to a Boolean */
6317 switch (op) {
6318 case Py_EQ:
6319 result = (result == 0);
6320 break;
6321 case Py_NE:
6322 result = (result != 0);
6323 break;
6324 case Py_LE:
6325 result = (result <= 0);
6326 break;
6327 case Py_GE:
6328 result = (result >= 0);
6329 break;
6330 case Py_LT:
6331 result = (result == -1);
6332 break;
6333 case Py_GT:
6334 result = (result == 1);
6335 break;
6336 }
6337 return PyBool_FromLong(result);
6338
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006339 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006340
6341 /* Standard case
6342
6343 Type errors mean that PyUnicode_FromObject() could not convert
6344 one of the arguments (usually the right hand side) to Unicode,
6345 ie. we can't handle the comparison request. However, it is
6346 possible that the other object knows a comparison method, which
6347 is why we return Py_NotImplemented to give the other object a
6348 chance.
6349
6350 */
6351 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6352 PyErr_Clear();
6353 Py_INCREF(Py_NotImplemented);
6354 return Py_NotImplemented;
6355 }
6356 if (op != Py_EQ && op != Py_NE)
6357 return NULL;
6358
6359 /* Equality comparison.
6360
6361 This is a special case: we silence any PyExc_UnicodeDecodeError
6362 and instead turn it into a PyErr_UnicodeWarning.
6363
6364 */
6365 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6366 return NULL;
6367 PyErr_Clear();
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006368 if (PyErr_Warn(PyExc_UnicodeWarning,
6369 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006370 "Unicode equal comparison "
6371 "failed to convert both arguments to Unicode - "
6372 "interpreting them as being unequal" :
6373 "Unicode unequal comparison "
6374 "failed to convert both arguments to Unicode - "
6375 "interpreting them as being unequal"
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006376 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006377 return NULL;
6378 result = (op == Py_NE);
6379 return PyBool_FromLong(result);
6380}
6381
Guido van Rossum403d68b2000-03-13 15:55:09 +00006382int PyUnicode_Contains(PyObject *container,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006383 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006384{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006385 PyObject *str, *sub;
6386 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006387
6388 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006389 sub = PyUnicode_FromObject(element);
6390 if (!sub) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006391 PyErr_SetString(PyExc_TypeError,
6392 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006393 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006394 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006395
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006396 str = PyUnicode_FromObject(container);
6397 if (!str) {
6398 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006399 return -1;
6400 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006401
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006402 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006403
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006404 Py_DECREF(str);
6405 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006406
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006407 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006408}
6409
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410/* Concat to string or Unicode object giving a new Unicode object. */
6411
6412PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006413 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414{
6415 PyUnicodeObject *u = NULL, *v = NULL, *w;
6416
6417 /* Coerce the two arguments */
6418 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6419 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006420 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6422 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006423 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424
6425 /* Shortcuts */
6426 if (v == unicode_empty) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006427 Py_DECREF(v);
6428 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 }
6430 if (u == unicode_empty) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006431 Py_DECREF(u);
6432 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 }
6434
6435 /* Concat the two Unicode strings */
6436 w = _PyUnicode_New(u->length + v->length);
6437 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006438 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 Py_UNICODE_COPY(w->str, u->str, u->length);
6440 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6441
6442 Py_DECREF(u);
6443 Py_DECREF(v);
6444 return (PyObject *)w;
6445
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006446 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 Py_XDECREF(u);
6448 Py_XDECREF(v);
6449 return NULL;
6450}
6451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006452PyDoc_STRVAR(count__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006453 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006455Return the number of non-overlapping occurrences of substring sub in\n\
6456Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006457interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458
6459static PyObject *
6460unicode_count(PyUnicodeObject *self, PyObject *args)
6461{
6462 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006463 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006464 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 PyObject *result;
6466
Guido van Rossumb8872e62000-05-09 14:14:27 +00006467 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006468 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 return NULL;
6470
6471 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006472 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006474 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006475
Fredrik Lundhc8162812006-05-26 19:33:03 +00006476 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006478 result = PyInt_FromSsize_t(
6479 stringlib_count(self->str + start, end - start,
6480 substring->str, substring->length)
6481 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482
6483 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006484
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 return result;
6486}
6487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006488PyDoc_STRVAR(encode__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006489 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006491Encodes S using the codec registered for encoding. encoding defaults\n\
6492to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006493handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006494a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6495'xmlcharrefreplace' as well as any other name registered with\n\
6496codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497
6498static PyObject *
6499unicode_encode(PyUnicodeObject *self, PyObject *args)
6500{
6501 char *encoding = NULL;
6502 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006503 PyObject *v;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006504
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6506 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006507 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006508 if (v == NULL)
6509 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006510 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006511 PyErr_Format(PyExc_TypeError,
6512 "encoder did not return a string/unicode object "
6513 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006514 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006515 Py_DECREF(v);
6516 return NULL;
6517 }
6518 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006519
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006520 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006521 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006522}
6523
6524PyDoc_STRVAR(decode__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006525 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006526\n\
6527Decodes S using the codec registered for encoding. encoding defaults\n\
6528to the default encoding. errors may be given to set a different error\n\
6529handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6530a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6531as well as any other name registerd with codecs.register_error that is\n\
6532able to handle UnicodeDecodeErrors.");
6533
6534static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006535unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006536{
6537 char *encoding = NULL;
6538 char *errors = NULL;
6539 PyObject *v;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006540
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006541 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6542 return NULL;
6543 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006544 if (v == NULL)
6545 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006546 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006547 PyErr_Format(PyExc_TypeError,
6548 "decoder did not return a string/unicode object "
6549 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006550 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006551 Py_DECREF(v);
6552 return NULL;
6553 }
6554 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006555
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006556 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558}
6559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006560PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006561 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562\n\
6563Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006564If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565
6566static PyObject*
6567unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6568{
6569 Py_UNICODE *e;
6570 Py_UNICODE *p;
6571 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006572 Py_UNICODE *qe;
6573 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 PyUnicodeObject *u;
6575 int tabsize = 8;
6576
6577 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579
Thomas Wouters7e474022000-07-16 12:04:32 +00006580 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006581 i = 0; /* chars up to and including most recent \n or \r */
6582 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6583 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 for (p = self->str; p < e; p++)
6585 if (*p == '\t') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006586 if (tabsize > 0) {
6587 incr = tabsize - (j % tabsize); /* cannot overflow */
6588 if (j > PY_SSIZE_T_MAX - incr)
6589 goto overflow1;
6590 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006591 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006594 if (j > PY_SSIZE_T_MAX - 1)
6595 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 j++;
6597 if (*p == '\n' || *p == '\r') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006598 if (i > PY_SSIZE_T_MAX - j)
6599 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006601 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 }
6603 }
6604
Guido van Rossum5bdff602008-03-11 21:18:06 +00006605 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006606 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006607
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 /* Second pass: create output string and fill it */
6609 u = _PyUnicode_New(i + j);
6610 if (!u)
6611 return NULL;
6612
Guido van Rossum5bdff602008-03-11 21:18:06 +00006613 j = 0; /* same as in first pass */
6614 q = u->str; /* next output char */
6615 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616
6617 for (p = self->str; p < e; p++)
6618 if (*p == '\t') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006619 if (tabsize > 0) {
6620 i = tabsize - (j % tabsize);
6621 j += i;
6622 while (i--) {
6623 if (q >= qe)
6624 goto overflow2;
6625 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006626 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006627 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006628 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006629 else {
6630 if (q >= qe)
6631 goto overflow2;
6632 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006633 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 if (*p == '\n' || *p == '\r')
6635 j = 0;
6636 }
6637
6638 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006639
6640 overflow2:
6641 Py_DECREF(u);
6642 overflow1:
6643 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645}
6646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006647PyDoc_STRVAR(find__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006648 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649\n\
6650Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006651such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652arguments start and end are interpreted as in slice notation.\n\
6653\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006654Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655
6656static PyObject *
6657unicode_find(PyUnicodeObject *self, PyObject *args)
6658{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006659 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006660 Py_ssize_t start;
6661 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006662 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663
Facundo Batista57d56692007-11-16 18:04:14 +00006664 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006667 result = stringlib_find_slice(
6668 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6669 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6670 start, end
6671 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
6673 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006674
6675 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676}
6677
6678static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006679unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680{
6681 if (index < 0 || index >= self->length) {
6682 PyErr_SetString(PyExc_IndexError, "string index out of range");
6683 return NULL;
6684 }
6685
6686 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6687}
6688
6689static long
6690unicode_hash(PyUnicodeObject *self)
6691{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006692 /* Since Unicode objects compare equal to their ASCII string
6693 counterparts, they should use the individual character values
6694 as basis for their hash value. This is needed to assure that
6695 strings and Unicode objects behave in the same way as
6696 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697
Martin v. Löwis18e16552006-02-15 17:27:45 +00006698 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006699 register Py_UNICODE *p;
6700 register long x;
6701
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 if (self->hash != -1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006703 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006704 len = PyUnicode_GET_SIZE(self);
6705 p = PyUnicode_AS_UNICODE(self);
6706 x = *p << 7;
6707 while (--len >= 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006708 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006709 x ^= PyUnicode_GET_SIZE(self);
6710 if (x == -1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006711 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006712 self->hash = x;
6713 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714}
6715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006716PyDoc_STRVAR(index__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006717 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006719Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720
6721static PyObject *
6722unicode_index(PyUnicodeObject *self, PyObject *args)
6723{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006724 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006725 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006726 Py_ssize_t start;
6727 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
Facundo Batista57d56692007-11-16 18:04:14 +00006729 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006732 result = stringlib_find_slice(
6733 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6734 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6735 start, end
6736 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737
6738 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006739
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 if (result < 0) {
6741 PyErr_SetString(PyExc_ValueError, "substring not found");
6742 return NULL;
6743 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006744
Martin v. Löwis18e16552006-02-15 17:27:45 +00006745 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746}
6747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006748PyDoc_STRVAR(islower__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006749 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006751Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006752at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
6754static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006755unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
6757 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6758 register const Py_UNICODE *e;
6759 int cased;
6760
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 /* Shortcut for single character strings */
6762 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006763 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006765 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006766 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006767 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006768
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 e = p + PyUnicode_GET_SIZE(self);
6770 cased = 0;
6771 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006772 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006773
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006774 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6775 return PyBool_FromLong(0);
6776 else if (!cased && Py_UNICODE_ISLOWER(ch))
6777 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006779 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780}
6781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006782PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006783 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006785Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006786at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787
6788static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006789unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790{
6791 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6792 register const Py_UNICODE *e;
6793 int cased;
6794
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 /* Shortcut for single character strings */
6796 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006797 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006799 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006800 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006801 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006802
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 e = p + PyUnicode_GET_SIZE(self);
6804 cased = 0;
6805 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006806 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006807
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006808 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6809 return PyBool_FromLong(0);
6810 else if (!cased && Py_UNICODE_ISUPPER(ch))
6811 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006813 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814}
6815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006816PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006817 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006819Return True if S is a titlecased string and there is at least one\n\
6820character in S, i.e. upper- and titlecase characters may only\n\
6821follow uncased characters and lowercase characters only cased ones.\n\
6822Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823
6824static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006825unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826{
6827 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6828 register const Py_UNICODE *e;
6829 int cased, previous_is_cased;
6830
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 /* Shortcut for single character strings */
6832 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006833 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6834 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006836 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006837 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006838 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 e = p + PyUnicode_GET_SIZE(self);
6841 cased = 0;
6842 previous_is_cased = 0;
6843 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006844 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006845
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006846 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6847 if (previous_is_cased)
6848 return PyBool_FromLong(0);
6849 previous_is_cased = 1;
6850 cased = 1;
6851 }
6852 else if (Py_UNICODE_ISLOWER(ch)) {
6853 if (!previous_is_cased)
6854 return PyBool_FromLong(0);
6855 previous_is_cased = 1;
6856 cased = 1;
6857 }
6858 else
6859 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006861 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862}
6863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006864PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006865 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006867Return True if all characters in S are whitespace\n\
6868and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869
6870static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006871unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872{
6873 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6874 register const Py_UNICODE *e;
6875
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 /* Shortcut for single character strings */
6877 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006878 Py_UNICODE_ISSPACE(*p))
6879 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006881 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006882 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006883 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006884
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 e = p + PyUnicode_GET_SIZE(self);
6886 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006887 if (!Py_UNICODE_ISSPACE(*p))
6888 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006890 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891}
6892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006893PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006894 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006895\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006896Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006897and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006898
6899static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006900unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006901{
6902 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6903 register const Py_UNICODE *e;
6904
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006905 /* Shortcut for single character strings */
6906 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006907 Py_UNICODE_ISALPHA(*p))
6908 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006909
6910 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006911 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006912 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006913
6914 e = p + PyUnicode_GET_SIZE(self);
6915 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006916 if (!Py_UNICODE_ISALPHA(*p))
6917 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006918 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006919 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006920}
6921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006922PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006923 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006924\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006925Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006926and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006927
6928static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006929unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006930{
6931 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6932 register const Py_UNICODE *e;
6933
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006934 /* Shortcut for single character strings */
6935 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006936 Py_UNICODE_ISALNUM(*p))
6937 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006938
6939 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006940 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006941 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006942
6943 e = p + PyUnicode_GET_SIZE(self);
6944 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006945 if (!Py_UNICODE_ISALNUM(*p))
6946 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006947 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006948 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006949}
6950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006951PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006952 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006954Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006955False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956
6957static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006958unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959{
6960 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6961 register const Py_UNICODE *e;
6962
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 /* Shortcut for single character strings */
6964 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006965 Py_UNICODE_ISDECIMAL(*p))
6966 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006968 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006969 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006970 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006971
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 e = p + PyUnicode_GET_SIZE(self);
6973 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006974 if (!Py_UNICODE_ISDECIMAL(*p))
6975 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006977 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978}
6979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006980PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006981 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006983Return True if all characters in S are digits\n\
6984and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985
6986static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006987unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988{
6989 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6990 register const Py_UNICODE *e;
6991
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 /* Shortcut for single character strings */
6993 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006994 Py_UNICODE_ISDIGIT(*p))
6995 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006997 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006998 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006999 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007000
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 e = p + PyUnicode_GET_SIZE(self);
7002 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007003 if (!Py_UNICODE_ISDIGIT(*p))
7004 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007006 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007}
7008
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007009PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007010 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007012Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007013False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014
7015static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007016unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017{
7018 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7019 register const Py_UNICODE *e;
7020
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 /* Shortcut for single character strings */
7022 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007023 Py_UNICODE_ISNUMERIC(*p))
7024 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007026 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007027 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007028 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007029
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 e = p + PyUnicode_GET_SIZE(self);
7031 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007032 if (!Py_UNICODE_ISNUMERIC(*p))
7033 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007035 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036}
7037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007038PyDoc_STRVAR(join__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007039 "S.join(sequence) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040\n\
7041Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007042sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043
7044static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007045unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007047 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048}
7049
Martin v. Löwis18e16552006-02-15 17:27:45 +00007050static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051unicode_length(PyUnicodeObject *self)
7052{
7053 return self->length;
7054}
7055
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007056PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007057 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007059Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007060done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061
7062static PyObject *
7063unicode_ljust(PyUnicodeObject *self, PyObject *args)
7064{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007065 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007066 Py_UNICODE fillchar = ' ';
7067
Martin v. Löwis412fb672006-04-13 06:34:32 +00007068 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 return NULL;
7070
Tim Peters7a29bd52001-09-12 03:03:31 +00007071 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 Py_INCREF(self);
7073 return (PyObject*) self;
7074 }
7075
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007076 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077}
7078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007079PyDoc_STRVAR(lower__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007080 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007082Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083
7084static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007085unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 return fixup(self, fixlower);
7088}
7089
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for mode i: the format string minus its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
7098
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007099/* externally visible for str.strip(unicode) */
7100PyObject *
7101_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7102{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007103 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7104 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7105 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7106 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7107 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007108
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007109 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007110
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007111 i = 0;
7112 if (striptype != RIGHTSTRIP) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007113 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7114 i++;
7115 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007116 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007117
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007118 j = len;
7119 if (striptype != LEFTSTRIP) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007120 do {
7121 j--;
7122 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7123 j++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007124 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007126 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007127 Py_INCREF(self);
7128 return (PyObject*)self;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007129 }
7130 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007131 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007132}
7133
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134
7135static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007136do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007138 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7139 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007140
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007141 i = 0;
7142 if (striptype != RIGHTSTRIP) {
7143 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7144 i++;
7145 }
7146 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007147
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007148 j = len;
7149 if (striptype != LEFTSTRIP) {
7150 do {
7151 j--;
7152 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7153 j++;
7154 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007155
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007156 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7157 Py_INCREF(self);
7158 return (PyObject*)self;
7159 }
7160 else
7161 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162}
7163
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007164
7165static PyObject *
7166do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7167{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007168 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007169
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007170 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7171 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007172
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007173 if (sep != NULL && sep != Py_None) {
7174 if (PyUnicode_Check(sep))
7175 return _PyUnicode_XStrip(self, striptype, sep);
7176 else if (PyString_Check(sep)) {
7177 PyObject *res;
7178 sep = PyUnicode_FromObject(sep);
7179 if (sep==NULL)
7180 return NULL;
7181 res = _PyUnicode_XStrip(self, striptype, sep);
7182 Py_DECREF(sep);
7183 return res;
7184 }
7185 else {
7186 PyErr_Format(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007187 "%s arg must be None, unicode or str",
7188 STRIPNAME(striptype));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007189 return NULL;
7190 }
7191 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007192
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007193 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007194}
7195
7196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007197PyDoc_STRVAR(strip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007198 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199\n\
7200Return a copy of the string S with leading and trailing\n\
7201whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007202If chars is given and not None, remove characters in chars instead.\n\
7203If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007204
7205static PyObject *
7206unicode_strip(PyUnicodeObject *self, PyObject *args)
7207{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007208 if (PyTuple_GET_SIZE(args) == 0)
7209 return do_strip(self, BOTHSTRIP); /* Common case */
7210 else
7211 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007212}
7213
7214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007215PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007216 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007217\n\
7218Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007219If chars is given and not None, remove characters in chars instead.\n\
7220If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007221
7222static PyObject *
7223unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7224{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007225 if (PyTuple_GET_SIZE(args) == 0)
7226 return do_strip(self, LEFTSTRIP); /* Common case */
7227 else
7228 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007229}
7230
7231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007232PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007233 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007234\n\
7235Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007236If chars is given and not None, remove characters in chars instead.\n\
7237If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007238
7239static PyObject *
7240unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7241{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007242 if (PyTuple_GET_SIZE(args) == 0)
7243 return do_strip(self, RIGHTSTRIP); /* Common case */
7244 else
7245 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007246}
7247
7248
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007250unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251{
7252 PyUnicodeObject *u;
7253 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007254 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007255 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256
7257 if (len < 0)
7258 len = 0;
7259
Tim Peters7a29bd52001-09-12 03:03:31 +00007260 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 /* no repeat, return original string */
7262 Py_INCREF(str);
7263 return (PyObject*) str;
7264 }
Tim Peters8f422462000-09-09 06:13:41 +00007265
7266 /* ensure # of chars needed doesn't overflow int and # of bytes
7267 * needed doesn't overflow size_t
7268 */
7269 nchars = len * str->length;
7270 if (len && nchars / len != str->length) {
7271 PyErr_SetString(PyExc_OverflowError,
7272 "repeated string is too long");
7273 return NULL;
7274 }
7275 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7276 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7277 PyErr_SetString(PyExc_OverflowError,
7278 "repeated string is too long");
7279 return NULL;
7280 }
7281 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 if (!u)
7283 return NULL;
7284
7285 p = u->str;
7286
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007287 if (str->length == 1 && len > 0) {
7288 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007289 } else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007290 Py_ssize_t done = 0; /* number of characters copied this far */
7291 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007292 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007293 done = str->length;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007294 }
7295 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007296 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007297 Py_UNICODE_COPY(p+done, p, n);
7298 done += n;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007299 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301
7302 return (PyObject*) u;
7303}
7304
7305PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007306 PyObject *subobj,
7307 PyObject *replobj,
7308 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309{
7310 PyObject *self;
7311 PyObject *str1;
7312 PyObject *str2;
7313 PyObject *result;
7314
7315 self = PyUnicode_FromObject(obj);
7316 if (self == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318 str1 = PyUnicode_FromObject(subobj);
7319 if (str1 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007320 Py_DECREF(self);
7321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322 }
7323 str2 = PyUnicode_FromObject(replobj);
7324 if (str2 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007325 Py_DECREF(self);
7326 Py_DECREF(str1);
7327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 }
Tim Petersced69f82003-09-16 20:30:58 +00007329 result = replace((PyUnicodeObject *)self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007330 (PyUnicodeObject *)str1,
7331 (PyUnicodeObject *)str2,
7332 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333 Py_DECREF(self);
7334 Py_DECREF(str1);
7335 Py_DECREF(str2);
7336 return result;
7337}
7338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007339PyDoc_STRVAR(replace__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007340 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341\n\
7342Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007343old replaced by new. If the optional argument count is\n\
7344given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345
7346static PyObject*
7347unicode_replace(PyUnicodeObject *self, PyObject *args)
7348{
7349 PyUnicodeObject *str1;
7350 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007351 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 PyObject *result;
7353
Martin v. Löwis18e16552006-02-15 17:27:45 +00007354 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 return NULL;
7356 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7357 if (str1 == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007360 if (str2 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007361 Py_DECREF(str1);
7362 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007363 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364
7365 result = replace(self, str1, str2, maxcount);
7366
7367 Py_DECREF(str1);
7368 Py_DECREF(str2);
7369 return result;
7370}
7371
7372static
7373PyObject *unicode_repr(PyObject *unicode)
7374{
7375 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007376 PyUnicode_GET_SIZE(unicode),
7377 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378}
7379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007380PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007381 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382\n\
7383Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007384such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385arguments start and end are interpreted as in slice notation.\n\
7386\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007387Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388
7389static PyObject *
7390unicode_rfind(PyUnicodeObject *self, PyObject *args)
7391{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007392 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007393 Py_ssize_t start;
7394 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007395 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396
Facundo Batista57d56692007-11-16 18:04:14 +00007397 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007400 result = stringlib_rfind_slice(
7401 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7402 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7403 start, end
7404 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
7406 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007407
7408 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409}
7410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007411PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007412 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007414Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415
7416static PyObject *
7417unicode_rindex(PyUnicodeObject *self, PyObject *args)
7418{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007419 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007420 Py_ssize_t start;
7421 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007422 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423
Facundo Batista57d56692007-11-16 18:04:14 +00007424 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007425 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007427 result = stringlib_rfind_slice(
7428 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7429 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7430 start, end
7431 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432
7433 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007434
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435 if (result < 0) {
7436 PyErr_SetString(PyExc_ValueError, "substring not found");
7437 return NULL;
7438 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007439 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440}
7441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007442PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007443 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007445Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007446done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447
7448static PyObject *
7449unicode_rjust(PyUnicodeObject *self, PyObject *args)
7450{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007451 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007452 Py_UNICODE fillchar = ' ';
7453
Martin v. Löwis412fb672006-04-13 06:34:32 +00007454 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 return NULL;
7456
Tim Peters7a29bd52001-09-12 03:03:31 +00007457 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 Py_INCREF(self);
7459 return (PyObject*) self;
7460 }
7461
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007462 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463}
7464
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007466unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467{
7468 /* standard clamping */
7469 if (start < 0)
7470 start = 0;
7471 if (end < 0)
7472 end = 0;
7473 if (end > self->length)
7474 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007475 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 /* full slice, return original string */
7477 Py_INCREF(self);
7478 return (PyObject*) self;
7479 }
7480 if (start > end)
7481 start = end;
7482 /* copy slice */
7483 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007484 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485}
7486
7487PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007488 PyObject *sep,
7489 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490{
7491 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007492
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 s = PyUnicode_FromObject(s);
7494 if (s == NULL)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007495 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007496 if (sep != NULL) {
7497 sep = PyUnicode_FromObject(sep);
7498 if (sep == NULL) {
7499 Py_DECREF(s);
7500 return NULL;
7501 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 }
7503
7504 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7505
7506 Py_DECREF(s);
7507 Py_XDECREF(sep);
7508 return result;
7509}
7510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007511PyDoc_STRVAR(split__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007512 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513\n\
7514Return a list of the words in S, using sep as the\n\
7515delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007516splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007517whitespace string is a separator and empty strings are\n\
7518removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519
7520static PyObject*
7521unicode_split(PyUnicodeObject *self, PyObject *args)
7522{
7523 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007524 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525
Martin v. Löwis18e16552006-02-15 17:27:45 +00007526 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 return NULL;
7528
7529 if (substring == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007530 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 else if (PyUnicode_Check(substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007532 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007534 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535}
7536
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007537PyObject *
7538PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7539{
7540 PyObject* str_obj;
7541 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007542 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007543
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007544 str_obj = PyUnicode_FromObject(str_in);
7545 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007546 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007547 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007548 if (!sep_obj) {
7549 Py_DECREF(str_obj);
7550 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007551 }
7552
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007553 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007554 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7555 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7556 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007557
Fredrik Lundhb9479482006-05-26 17:22:38 +00007558 Py_DECREF(sep_obj);
7559 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007560
7561 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007562}
7563
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007564
7565PyObject *
7566PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7567{
7568 PyObject* str_obj;
7569 PyObject* sep_obj;
7570 PyObject* out;
7571
7572 str_obj = PyUnicode_FromObject(str_in);
7573 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007574 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007575 sep_obj = PyUnicode_FromObject(sep_in);
7576 if (!sep_obj) {
7577 Py_DECREF(str_obj);
7578 return NULL;
7579 }
7580
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007581 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007582 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7583 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7584 );
7585
7586 Py_DECREF(sep_obj);
7587 Py_DECREF(str_obj);
7588
7589 return out;
7590}
7591
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007592PyDoc_STRVAR(partition__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007593 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007594\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007595Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007596the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007597found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007598
7599static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007600unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007601{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007602 return PyUnicode_Partition((PyObject *)self, separator);
7603}
7604
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007605PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007606 "S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007607\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007608Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007609the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007610separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007611
7612static PyObject*
7613unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7614{
7615 return PyUnicode_RPartition((PyObject *)self, separator);
7616}
7617
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007618PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007619 PyObject *sep,
7620 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007621{
7622 PyObject *result;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007623
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007624 s = PyUnicode_FromObject(s);
7625 if (s == NULL)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007626 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007627 if (sep != NULL) {
7628 sep = PyUnicode_FromObject(sep);
7629 if (sep == NULL) {
7630 Py_DECREF(s);
7631 return NULL;
7632 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007633 }
7634
7635 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7636
7637 Py_DECREF(s);
7638 Py_XDECREF(sep);
7639 return result;
7640}
7641
7642PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007643 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007644\n\
7645Return a list of the words in S, using sep as the\n\
7646delimiter string, starting at the end of the string and\n\
7647working to the front. If maxsplit is given, at most maxsplit\n\
7648splits are done. If sep is not specified, any whitespace string\n\
7649is a separator.");
7650
7651static PyObject*
7652unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7653{
7654 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007655 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007656
Martin v. Löwis18e16552006-02-15 17:27:45 +00007657 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007658 return NULL;
7659
7660 if (substring == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007661 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007662 else if (PyUnicode_Check(substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007663 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007664 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007665 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007666}
7667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007668PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007669 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670\n\
7671Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007672Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007673is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674
7675static PyObject*
7676unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7677{
Guido van Rossum86662912000-04-11 15:38:46 +00007678 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679
Guido van Rossum86662912000-04-11 15:38:46 +00007680 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 return NULL;
7682
Guido van Rossum86662912000-04-11 15:38:46 +00007683 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684}
7685
7686static
7687PyObject *unicode_str(PyUnicodeObject *self)
7688{
Fred Drakee4315f52000-05-09 19:53:39 +00007689 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690}
7691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007692PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007693 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694\n\
7695Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007696and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697
7698static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007699unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 return fixup(self, fixswapcase);
7702}
7703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007704PyDoc_STRVAR(translate__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007705 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706\n\
7707Return a copy of the string S, where all characters have been mapped\n\
7708through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007709Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7710Unmapped characters are left untouched. Characters mapped to None\n\
7711are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712
7713static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007714unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715{
Tim Petersced69f82003-09-16 20:30:58 +00007716 return PyUnicode_TranslateCharmap(self->str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007717 self->length,
7718 table,
7719 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720}
7721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007722PyDoc_STRVAR(upper__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007723 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007725Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
7727static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007728unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 return fixup(self, fixupper);
7731}
7732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007733PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007734 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735\n\
Georg Brandl98064072008-09-09 19:26:00 +00007736Pad a numeric string S with zeros on the left, to fill a field\n\
7737of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738
7739static PyObject *
7740unicode_zfill(PyUnicodeObject *self, PyObject *args)
7741{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007742 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 PyUnicodeObject *u;
7744
Martin v. Löwis18e16552006-02-15 17:27:45 +00007745 Py_ssize_t width;
7746 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747 return NULL;
7748
7749 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007750 if (PyUnicode_CheckExact(self)) {
7751 Py_INCREF(self);
7752 return (PyObject*) self;
7753 }
7754 else
7755 return PyUnicode_FromUnicode(
7756 PyUnicode_AS_UNICODE(self),
7757 PyUnicode_GET_SIZE(self)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007758 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 }
7760
7761 fill = width - self->length;
7762
7763 u = pad(self, fill, 0, '0');
7764
Walter Dörwald068325e2002-04-15 13:36:47 +00007765 if (u == NULL)
7766 return NULL;
7767
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 if (u->str[fill] == '+' || u->str[fill] == '-') {
7769 /* move sign to beginning of string */
7770 u->str[0] = u->str[fill];
7771 u->str[fill] = '0';
7772 }
7773
7774 return (PyObject*) u;
7775}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776
#if 0
/* Compiled out.  Returns the file-level numfree counter as a Python
   int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007785PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007786 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007788Return True if S starts with the specified prefix, False otherwise.\n\
7789With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007790With optional end, stop comparing S at that position.\n\
7791prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792
7793static PyObject *
7794unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007795 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796{
Georg Brandl24250812006-06-09 18:45:48 +00007797 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007799 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007800 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007801 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802
Georg Brandl24250812006-06-09 18:45:48 +00007803 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007804 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7805 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007806 if (PyTuple_Check(subobj)) {
7807 Py_ssize_t i;
7808 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7809 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007810 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007811 if (substring == NULL)
7812 return NULL;
7813 result = tailmatch(self, substring, start, end, -1);
7814 Py_DECREF(substring);
7815 if (result) {
7816 Py_RETURN_TRUE;
7817 }
7818 }
7819 /* nothing matched */
7820 Py_RETURN_FALSE;
7821 }
7822 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007824 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007825 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007827 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828}
7829
7830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007831PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007832 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007834Return True if S ends with the specified suffix, False otherwise.\n\
7835With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007836With optional end, stop comparing S at that position.\n\
7837suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838
7839static PyObject *
7840unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007841 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842{
Georg Brandl24250812006-06-09 18:45:48 +00007843 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007845 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007846 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007847 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848
Georg Brandl24250812006-06-09 18:45:48 +00007849 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007850 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7851 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007852 if (PyTuple_Check(subobj)) {
7853 Py_ssize_t i;
7854 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7855 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007856 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007857 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007858 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007859 result = tailmatch(self, substring, start, end, +1);
7860 Py_DECREF(substring);
7861 if (result) {
7862 Py_RETURN_TRUE;
7863 }
7864 }
7865 Py_RETURN_FALSE;
7866 }
7867 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007869 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870
Georg Brandl24250812006-06-09 18:45:48 +00007871 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007873 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874}
7875
7876
Eric Smitha9f7d622008-02-17 19:46:49 +00007877/* Implements do_string_format, which is unicode because of stringlib */
7878#include "stringlib/string_format.h"
7879
7880PyDoc_STRVAR(format__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007881 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007882\n\
7883");
7884
Eric Smithdc13b792008-05-30 18:10:04 +00007885static PyObject *
7886unicode__format__(PyObject *self, PyObject *args)
7887{
7888 PyObject *format_spec;
7889 PyObject *result = NULL;
7890 PyObject *tmp = NULL;
7891
7892 /* If 2.x, convert format_spec to the same type as value */
7893 /* This is to allow things like u''.format('') */
7894 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7895 goto done;
7896 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7897 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007898 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007899 goto done;
7900 }
7901 tmp = PyObject_Unicode(format_spec);
7902 if (tmp == NULL)
7903 goto done;
7904 format_spec = tmp;
7905
7906 result = _PyUnicode_FormatAdvanced(self,
7907 PyUnicode_AS_UNICODE(format_spec),
7908 PyUnicode_GET_SIZE(format_spec));
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007909 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007910 Py_XDECREF(tmp);
7911 return result;
7912}
7913
Eric Smitha9f7d622008-02-17 19:46:49 +00007914PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007915 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007916\n\
7917");
7918
Robert Schuppenies901c9972008-06-10 10:10:31 +00007919static PyObject *
7920unicode__sizeof__(PyUnicodeObject *v)
7921{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007922 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7923 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007924}
7925
/* Docstring for unicode.__sizeof__(); bound to unicode__sizeof__() by the
   "__sizeof__" row of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes\n\
\n\
");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007930
7931static PyObject *
7932unicode_getnewargs(PyUnicodeObject *v)
7933{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007934 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007935}
7936
7937
/* Method table for the unicode type.

   Each row is {Python-visible name, C implementation cast to PyCFunction,
   calling-convention flags, docstring}.  Rows that stop after the flags
   supply no docstring.  Rows inside "#if 0" or inside a comment are not
   compiled.  The {NULL, NULL} row terminates the table. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* str.format() machinery; do_string_format and the two _formatter_*
       helpers are not defined in this part of the file (stringlib include). */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8000
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008001static PyObject *
8002unicode_mod(PyObject *v, PyObject *w)
8003{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008004 if (!PyUnicode_Check(v)) {
8005 Py_INCREF(Py_NotImplemented);
8006 return Py_NotImplemented;
8007 }
8008 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008009}
8010
/* Number protocol slots for unicode.  Only nb_remainder (the % operator,
   see unicode_mod) is filled in; the slots after it are not listed and are
   therefore zero-initialised by the C aggregate-initialisation rules. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,    /*nb_remainder*/
};
8018
/* Sequence protocol slots for unicode.  The initialisers are positional, so
   their order must match the slot named in each trailing comment.  The two
   assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,               /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                              /* sq_ass_item */
    0,                              /* sq_ass_slice */
    PyUnicode_Contains,             /* sq_contains */
};
8029
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008030static PyObject*
8031unicode_subscript(PyUnicodeObject* self, PyObject* item)
8032{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008033 if (PyIndex_Check(item)) {
8034 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008035 if (i == -1 && PyErr_Occurred())
8036 return NULL;
8037 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008038 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008039 return unicode_getitem(self, i);
8040 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008041 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008042 Py_UNICODE* source_buf;
8043 Py_UNICODE* result_buf;
8044 PyObject* result;
8045
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008046 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008047 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008048 return NULL;
8049 }
8050
8051 if (slicelength <= 0) {
8052 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008053 } else if (start == 0 && step == 1 && slicelength == self->length &&
8054 PyUnicode_CheckExact(self)) {
8055 Py_INCREF(self);
8056 return (PyObject *)self;
8057 } else if (step == 1) {
8058 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008059 } else {
8060 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008061 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8062 sizeof(Py_UNICODE));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008063
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008064 if (result_buf == NULL)
8065 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008066
8067 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8068 result_buf[i] = source_buf[cur];
8069 }
Tim Petersced69f82003-09-16 20:30:58 +00008070
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008071 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008072 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008073 return result;
8074 }
8075 } else {
8076 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8077 return NULL;
8078 }
8079}
8080
/* Mapping protocol slots for unicode: length and subscription are provided
   (the latter via unicode_subscript, which handles both indexes and
   slices); the assignment slot is a null function pointer. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,               /* mp_ass_subscript */
};
8086
Martin v. Löwis18e16552006-02-15 17:27:45 +00008087static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008089 Py_ssize_t index,
8090 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091{
8092 if (index != 0) {
8093 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008094 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 return -1;
8096 }
8097 *ptr = (void *) self->str;
8098 return PyUnicode_GET_DATA_SIZE(self);
8099}
8100
Martin v. Löwis18e16552006-02-15 17:27:45 +00008101static Py_ssize_t
8102unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008103 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104{
8105 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008106 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 return -1;
8108}
8109
8110static int
8111unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008112 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113{
8114 if (lenp)
8115 *lenp = PyUnicode_GET_DATA_SIZE(self);
8116 return 1;
8117}
8118
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008119static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008121 Py_ssize_t index,
8122 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123{
8124 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008125
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 if (index != 0) {
8127 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008128 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 return -1;
8130 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008131 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008133 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008134 *ptr = (void *) PyString_AS_STRING(str);
8135 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136}
8137
8138/* Helpers for PyUnicode_Format() */
8139
8140static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008141getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008143 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 if (argidx < arglen) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008145 (*p_argidx)++;
8146 if (arglen < 0)
8147 return args;
8148 else
8149 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 }
8151 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008152 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 return NULL;
8154}
8155
/* Bit flags collected in the `flags` int while a % conversion specifier is
   parsed.  In the code that follows, F_LJUST is set for a '-' and F_ALT
   selects the '#' alternate form in formatfloat()/formatint().
   NOTE(review): F_SIGN, F_BLANK and F_ZERO presumably correspond to the
   '+', ' ' and '0' flags -- confirm against the rest of PyUnicode_Format(). */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161
Martin v. Löwis18e16552006-02-15 17:27:45 +00008162static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008163strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008165 register Py_ssize_t i;
8166 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167 for (i = len - 1; i >= 0; i--)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008168 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170 return len;
8171}
8172
Neal Norwitzfc76d632006-01-10 06:03:13 +00008173static int
8174doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8175{
Tim Peters15231542006-02-16 01:08:01 +00008176 Py_ssize_t result;
8177
Neal Norwitzfc76d632006-01-10 06:03:13 +00008178 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008179 result = strtounicode(buffer, (char *)buffer);
8180 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008181}
8182
8183static int
8184longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8185{
Tim Peters15231542006-02-16 01:08:01 +00008186 Py_ssize_t result;
8187
Neal Norwitzfc76d632006-01-10 06:03:13 +00008188 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008189 result = strtounicode(buffer, (char *)buffer);
8190 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008191}
8192
Guido van Rossum078151d2002-08-11 04:24:12 +00008193/* XXX To save some code duplication, formatfloat/long/int could have been
8194 shared with stringobject.c, converting from 8-bit to Unicode after the
8195 formatting is done. */
8196
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197static int
8198formatfloat(Py_UNICODE *buf,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008199 size_t buflen,
8200 int flags,
8201 int prec,
8202 int type,
8203 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008205 /* fmt = '%#.' + `prec` + `type`
8206 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207 char fmt[20];
8208 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008209
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 x = PyFloat_AsDouble(v);
8211 if (x == -1.0 && PyErr_Occurred())
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008212 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 if (prec < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008214 prec = 6;
Mark Dickinson75be68b2009-08-28 20:57:42 +00008215#if SIZEOF_INT > 4
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008216 /* make sure that the decimal representation of precision really does
8217 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
Mark Dickinson75be68b2009-08-28 20:57:42 +00008218 if (prec > 0x7fffffff) {
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008219 PyErr_SetString(PyExc_OverflowError,
8220 "outrageously large precision "
8221 "for formatted float");
8222 return -1;
8223 }
Mark Dickinson75be68b2009-08-28 20:57:42 +00008224#endif
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008225
Mark Dickinsona30f3492009-03-29 15:06:29 +00008226 if (type == 'f' && fabs(x) >= 1e50)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008227 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008228 /* Worst case length calc to ensure no buffer overrun:
8229
8230 'g' formats:
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008231 fmt = %#.<prec>g
8232 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8233 for any double rep.)
8234 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008235
8236 'f' formats:
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008237 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8238 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008239
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008240 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008241 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008242
8243 */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008244 if (((type == 'g' || type == 'G') &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008245 buflen <= (size_t)10 + (size_t)prec) ||
8246 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8247 PyErr_SetString(PyExc_OverflowError,
8248 "formatted float is too long (precision too large?)");
8249 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008250 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008251 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008252 (flags&F_ALT) ? "#" : "",
8253 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008254 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255}
8256
Tim Peters38fd5b62000-09-21 05:43:11 +00008257static PyObject*
8258formatlong(PyObject *val, int flags, int prec, int type)
8259{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008260 char *buf;
8261 int i, len;
8262 PyObject *str; /* temporary string object. */
8263 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008264
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008265 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8266 if (!str)
8267 return NULL;
8268 result = _PyUnicode_New(len);
8269 if (!result) {
8270 Py_DECREF(str);
8271 return NULL;
8272 }
8273 for (i = 0; i < len; i++)
8274 result->str[i] = buf[i];
8275 result->str[len] = 0;
8276 Py_DECREF(str);
8277 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008278}
8279
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280static int
8281formatint(Py_UNICODE *buf,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008282 size_t buflen,
8283 int flags,
8284 int prec,
8285 int type,
8286 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008288 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008289 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8290 * + 1 + 1
8291 * = 24
8292 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008293 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008294 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 long x;
8296
8297 x = PyInt_AsLong(v);
8298 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008299 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008300 if (x < 0 && type == 'u') {
8301 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008302 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008303 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8304 sign = "-";
8305 else
8306 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008308 prec = 1;
8309
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008310 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8311 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008312 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008313 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008314 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008315 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008316 return -1;
8317 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008318
8319 if ((flags & F_ALT) &&
8320 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008321 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008322 * of issues that cause pain:
8323 * - when 0 is being converted, the C standard leaves off
8324 * the '0x' or '0X', which is inconsistent with other
8325 * %#x/%#X conversions and inconsistent with Python's
8326 * hex() function
8327 * - there are platforms that violate the standard and
8328 * convert 0 with the '0x' or '0X'
8329 * (Metrowerks, Compaq Tru64)
8330 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008331 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008332 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008333 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008334 * We can achieve the desired consistency by inserting our
8335 * own '0x' or '0X' prefix, and substituting %x/%X in place
8336 * of %#x/%#X.
8337 *
8338 * Note that this is the same approach as used in
8339 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008340 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008341 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8342 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008343 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008344 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008345 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8346 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008347 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008348 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008349 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008350 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008351 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008352 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353}
8354
8355static int
8356formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008357 size_t buflen,
8358 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008360 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008361 if (PyUnicode_Check(v)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008362 if (PyUnicode_GET_SIZE(v) != 1)
8363 goto onError;
8364 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008367 else if (PyString_Check(v)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008368 if (PyString_GET_SIZE(v) != 1)
8369 goto onError;
8370 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008371 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372
8373 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008374 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 long x;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008376 x = PyInt_AsLong(v);
8377 if (x == -1 && PyErr_Occurred())
8378 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008379#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008380 if (x < 0 || x > 0x10ffff) {
8381 PyErr_SetString(PyExc_OverflowError,
8382 "%c arg not in range(0x110000) "
8383 "(wide Python build)");
8384 return -1;
8385 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008386#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008387 if (x < 0 || x > 0xffff) {
8388 PyErr_SetString(PyExc_OverflowError,
8389 "%c arg not in range(0x10000) "
8390 "(narrow Python build)");
8391 return -1;
8392 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008393#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008394 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 }
8396 buf[1] = '\0';
8397 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008398
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008399 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008400 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008401 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008402 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403}
8404
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008405/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8406
8407 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8408 chars are formatted. XXX This is a magic number. Each formatting
8409 routine does bounds checking to ensure no overflow, but a better
8410 solution may be to malloc a buffer of appropriate size for each
8411 format. For now, the current solution is sufficient.
8412*/
/* Parenthesised so the cast expression stays a single operand wherever the
   macro is expanded (e.g. after sizeof or next to a postfix operator). */
#define FORMATBUFLEN ((size_t)120)
8414
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008416 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417{
8418 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008419 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 int args_owned = 0;
8421 PyUnicodeObject *result = NULL;
8422 PyObject *dict = NULL;
8423 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008424
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 if (format == NULL || args == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008426 PyErr_BadInternalCall();
8427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 }
8429 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008430 if (uformat == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 fmt = PyUnicode_AS_UNICODE(uformat);
8433 fmtcnt = PyUnicode_GET_SIZE(uformat);
8434
8435 reslen = rescnt = fmtcnt + 100;
8436 result = _PyUnicode_New(reslen);
8437 if (result == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008438 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 res = PyUnicode_AS_UNICODE(result);
8440
8441 if (PyTuple_Check(args)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008442 arglen = PyTuple_Size(args);
8443 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 }
8445 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008446 arglen = -1;
8447 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 }
Christian Heimese93237d2007-12-19 02:37:44 +00008449 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008450 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008451 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452
8453 while (--fmtcnt >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008454 if (*fmt != '%') {
8455 if (--rescnt < 0) {
8456 rescnt = fmtcnt + 100;
8457 reslen += rescnt;
8458 if (_PyUnicode_Resize(&result, reslen) < 0)
8459 goto onError;
8460 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8461 --rescnt;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008462 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008463 *res++ = *fmt++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008464 }
8465 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008466 /* Got a format specifier */
8467 int flags = 0;
8468 Py_ssize_t width = -1;
8469 int prec = -1;
8470 Py_UNICODE c = '\0';
8471 Py_UNICODE fill;
8472 int isnumok;
8473 PyObject *v = NULL;
8474 PyObject *temp = NULL;
8475 Py_UNICODE *pbuf;
8476 Py_UNICODE sign;
8477 Py_ssize_t len;
8478 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8479
8480 fmt++;
8481 if (*fmt == '(') {
8482 Py_UNICODE *keystart;
8483 Py_ssize_t keylen;
8484 PyObject *key;
8485 int pcount = 1;
8486
8487 if (dict == NULL) {
8488 PyErr_SetString(PyExc_TypeError,
8489 "format requires a mapping");
8490 goto onError;
8491 }
8492 ++fmt;
8493 --fmtcnt;
8494 keystart = fmt;
8495 /* Skip over balanced parentheses */
8496 while (pcount > 0 && --fmtcnt >= 0) {
8497 if (*fmt == ')')
8498 --pcount;
8499 else if (*fmt == '(')
8500 ++pcount;
8501 fmt++;
8502 }
8503 keylen = fmt - keystart - 1;
8504 if (fmtcnt < 0 || pcount > 0) {
8505 PyErr_SetString(PyExc_ValueError,
8506 "incomplete format key");
8507 goto onError;
8508 }
8509#if 0
8510 /* keys are converted to strings using UTF-8 and
8511 then looked up since Python uses strings to hold
8512 variables names etc. in its namespaces and we
8513 wouldn't want to break common idioms. */
8514 key = PyUnicode_EncodeUTF8(keystart,
8515 keylen,
8516 NULL);
8517#else
8518 key = PyUnicode_FromUnicode(keystart, keylen);
8519#endif
8520 if (key == NULL)
8521 goto onError;
8522 if (args_owned) {
8523 Py_DECREF(args);
8524 args_owned = 0;
8525 }
8526 args = PyObject_GetItem(dict, key);
8527 Py_DECREF(key);
8528 if (args == NULL) {
8529 goto onError;
8530 }
8531 args_owned = 1;
8532 arglen = -1;
8533 argidx = -2;
8534 }
8535 while (--fmtcnt >= 0) {
8536 switch (c = *fmt++) {
8537 case '-': flags |= F_LJUST; continue;
8538 case '+': flags |= F_SIGN; continue;
8539 case ' ': flags |= F_BLANK; continue;
8540 case '#': flags |= F_ALT; continue;
8541 case '0': flags |= F_ZERO; continue;
8542 }
8543 break;
8544 }
8545 if (c == '*') {
8546 v = getnextarg(args, arglen, &argidx);
8547 if (v == NULL)
8548 goto onError;
8549 if (!PyInt_Check(v)) {
8550 PyErr_SetString(PyExc_TypeError,
8551 "* wants int");
8552 goto onError;
8553 }
8554 width = PyInt_AsLong(v);
8555 if (width < 0) {
8556 flags |= F_LJUST;
8557 width = -width;
8558 }
8559 if (--fmtcnt >= 0)
8560 c = *fmt++;
8561 }
8562 else if (c >= '0' && c <= '9') {
8563 width = c - '0';
8564 while (--fmtcnt >= 0) {
8565 c = *fmt++;
8566 if (c < '0' || c > '9')
8567 break;
8568 if ((width*10) / 10 != width) {
8569 PyErr_SetString(PyExc_ValueError,
8570 "width too big");
8571 goto onError;
8572 }
8573 width = width*10 + (c - '0');
8574 }
8575 }
8576 if (c == '.') {
8577 prec = 0;
8578 if (--fmtcnt >= 0)
8579 c = *fmt++;
8580 if (c == '*') {
8581 v = getnextarg(args, arglen, &argidx);
8582 if (v == NULL)
8583 goto onError;
8584 if (!PyInt_Check(v)) {
8585 PyErr_SetString(PyExc_TypeError,
8586 "* wants int");
8587 goto onError;
8588 }
8589 prec = PyInt_AsLong(v);
8590 if (prec < 0)
8591 prec = 0;
8592 if (--fmtcnt >= 0)
8593 c = *fmt++;
8594 }
8595 else if (c >= '0' && c <= '9') {
8596 prec = c - '0';
8597 while (--fmtcnt >= 0) {
8598 c = Py_CHARMASK(*fmt++);
8599 if (c < '0' || c > '9')
8600 break;
8601 if ((prec*10) / 10 != prec) {
8602 PyErr_SetString(PyExc_ValueError,
8603 "prec too big");
8604 goto onError;
8605 }
8606 prec = prec*10 + (c - '0');
8607 }
8608 }
8609 } /* prec */
8610 if (fmtcnt >= 0) {
8611 if (c == 'h' || c == 'l' || c == 'L') {
8612 if (--fmtcnt >= 0)
8613 c = *fmt++;
8614 }
8615 }
8616 if (fmtcnt < 0) {
8617 PyErr_SetString(PyExc_ValueError,
8618 "incomplete format");
8619 goto onError;
8620 }
8621 if (c != '%') {
8622 v = getnextarg(args, arglen, &argidx);
8623 if (v == NULL)
8624 goto onError;
8625 }
8626 sign = 0;
8627 fill = ' ';
8628 switch (c) {
8629
8630 case '%':
8631 pbuf = formatbuf;
8632 /* presume that buffer length is at least 1 */
8633 pbuf[0] = '%';
8634 len = 1;
8635 break;
8636
8637 case 's':
8638 case 'r':
8639 if (PyUnicode_Check(v) && c == 's') {
8640 temp = v;
8641 Py_INCREF(temp);
8642 }
8643 else {
8644 PyObject *unicode;
8645 if (c == 's')
8646 temp = PyObject_Unicode(v);
8647 else
8648 temp = PyObject_Repr(v);
8649 if (temp == NULL)
8650 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008651 if (PyUnicode_Check(temp))
8652 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008653 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008654 /* convert to string to Unicode */
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008655 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8656 PyString_GET_SIZE(temp),
8657 NULL,
8658 "strict");
8659 Py_DECREF(temp);
8660 temp = unicode;
8661 if (temp == NULL)
8662 goto onError;
8663 }
8664 else {
8665 Py_DECREF(temp);
8666 PyErr_SetString(PyExc_TypeError,
8667 "%s argument has non-string str()");
8668 goto onError;
8669 }
8670 }
8671 pbuf = PyUnicode_AS_UNICODE(temp);
8672 len = PyUnicode_GET_SIZE(temp);
8673 if (prec >= 0 && len > prec)
8674 len = prec;
8675 break;
8676
8677 case 'i':
8678 case 'd':
8679 case 'u':
8680 case 'o':
8681 case 'x':
8682 case 'X':
8683 if (c == 'i')
8684 c = 'd';
8685 isnumok = 0;
8686 if (PyNumber_Check(v)) {
8687 PyObject *iobj=NULL;
8688
8689 if (PyInt_Check(v) || (PyLong_Check(v))) {
8690 iobj = v;
8691 Py_INCREF(iobj);
8692 }
8693 else {
8694 iobj = PyNumber_Int(v);
8695 if (iobj==NULL) iobj = PyNumber_Long(v);
8696 }
8697 if (iobj!=NULL) {
8698 if (PyInt_Check(iobj)) {
8699 isnumok = 1;
8700 pbuf = formatbuf;
8701 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8702 flags, prec, c, iobj);
8703 Py_DECREF(iobj);
8704 if (len < 0)
8705 goto onError;
8706 sign = 1;
8707 }
8708 else if (PyLong_Check(iobj)) {
8709 isnumok = 1;
8710 temp = formatlong(iobj, flags, prec, c);
8711 Py_DECREF(iobj);
8712 if (!temp)
8713 goto onError;
8714 pbuf = PyUnicode_AS_UNICODE(temp);
8715 len = PyUnicode_GET_SIZE(temp);
8716 sign = 1;
8717 }
8718 else {
8719 Py_DECREF(iobj);
8720 }
8721 }
8722 }
8723 if (!isnumok) {
8724 PyErr_Format(PyExc_TypeError,
8725 "%%%c format: a number is required, "
8726 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8727 goto onError;
8728 }
8729 if (flags & F_ZERO)
8730 fill = '0';
8731 break;
8732
8733 case 'e':
8734 case 'E':
8735 case 'f':
8736 case 'F':
8737 case 'g':
8738 case 'G':
8739 if (c == 'F')
8740 c = 'f';
8741 pbuf = formatbuf;
8742 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8743 flags, prec, c, v);
8744 if (len < 0)
8745 goto onError;
8746 sign = 1;
8747 if (flags & F_ZERO)
8748 fill = '0';
8749 break;
8750
8751 case 'c':
8752 pbuf = formatbuf;
8753 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8754 if (len < 0)
8755 goto onError;
8756 break;
8757
8758 default:
8759 PyErr_Format(PyExc_ValueError,
8760 "unsupported format character '%c' (0x%x) "
8761 "at index %zd",
8762 (31<=c && c<=126) ? (char)c : '?',
8763 (int)c,
8764 (Py_ssize_t)(fmt - 1 -
8765 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008766 goto onError;
8767 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008768 if (sign) {
8769 if (*pbuf == '-' || *pbuf == '+') {
8770 sign = *pbuf++;
8771 len--;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008772 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008773 else if (flags & F_SIGN)
8774 sign = '+';
8775 else if (flags & F_BLANK)
8776 sign = ' ';
8777 else
8778 sign = 0;
8779 }
8780 if (width < len)
8781 width = len;
8782 if (rescnt - (sign != 0) < width) {
8783 reslen -= rescnt;
8784 rescnt = width + fmtcnt + 100;
8785 reslen += rescnt;
8786 if (reslen < 0) {
8787 Py_XDECREF(temp);
8788 PyErr_NoMemory();
8789 goto onError;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008790 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008791 if (_PyUnicode_Resize(&result, reslen) < 0) {
8792 Py_XDECREF(temp);
8793 goto onError;
8794 }
8795 res = PyUnicode_AS_UNICODE(result)
8796 + reslen - rescnt;
8797 }
8798 if (sign) {
8799 if (fill != ' ')
8800 *res++ = sign;
8801 rescnt--;
8802 if (width > len)
8803 width--;
8804 }
8805 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8806 assert(pbuf[0] == '0');
8807 assert(pbuf[1] == c);
8808 if (fill != ' ') {
8809 *res++ = *pbuf++;
8810 *res++ = *pbuf++;
8811 }
8812 rescnt -= 2;
8813 width -= 2;
8814 if (width < 0)
8815 width = 0;
8816 len -= 2;
8817 }
8818 if (width > len && !(flags & F_LJUST)) {
8819 do {
8820 --rescnt;
8821 *res++ = fill;
8822 } while (--width > len);
8823 }
8824 if (fill == ' ') {
8825 if (sign)
8826 *res++ = sign;
8827 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8828 assert(pbuf[0] == '0');
8829 assert(pbuf[1] == c);
8830 *res++ = *pbuf++;
8831 *res++ = *pbuf++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008832 }
8833 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008834 Py_UNICODE_COPY(res, pbuf, len);
8835 res += len;
8836 rescnt -= len;
8837 while (--width >= len) {
8838 --rescnt;
8839 *res++ = ' ';
8840 }
8841 if (dict && (argidx < arglen) && c != '%') {
8842 PyErr_SetString(PyExc_TypeError,
8843 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008844 Py_XDECREF(temp);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008845 goto onError;
8846 }
8847 Py_XDECREF(temp);
8848 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 } /* until end */
8850 if (argidx < arglen && !dict) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008851 PyErr_SetString(PyExc_TypeError,
8852 "not all arguments converted during string formatting");
8853 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 }
8855
Thomas Woutersa96affe2006-03-12 00:29:36 +00008856 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008857 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 if (args_owned) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008859 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860 }
8861 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 return (PyObject *)result;
8863
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008864 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 Py_XDECREF(result);
8866 Py_DECREF(uformat);
8867 if (args_owned) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008868 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 }
8870 return NULL;
8871}
8872
/* Buffer-interface slot table for unicode objects.  It is installed as
   tp_as_buffer in PyUnicode_Type below.  The four unicode_buffer_* handlers
   are not defined in this part of the file; each is cast to the slot's
   function-pointer type. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* read buffer */
    (writebufferproc) unicode_buffer_getwritebuf,   /* write buffer */
    (segcountproc) unicode_buffer_getsegcount,      /* segment count */
    (charbufferproc) unicode_buffer_getcharbuf,     /* character buffer */
};
8879
Jeremy Hylton938ace62002-07-17 16:30:39 +00008880static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008881unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8882
Tim Peters6d6c1a32001-08-02 04:15:00 +00008883static PyObject *
8884unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8885{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008886 PyObject *x = NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008887 static char *kwlist[] = {"string", "encoding", "errors", 0};
8888 char *encoding = NULL;
8889 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008890
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008891 if (type != &PyUnicode_Type)
8892 return unicode_subtype_new(type, args, kwds);
8893 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008894 kwlist, &x, &encoding, &errors))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008895 return NULL;
8896 if (x == NULL)
8897 return (PyObject *)_PyUnicode_New(0);
8898 if (encoding == NULL && errors == NULL)
8899 return PyObject_Unicode(x);
8900 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008901 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008902}
8903
Guido van Rossume023fe02001-08-30 03:12:59 +00008904static PyObject *
8905unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8906{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008907 PyUnicodeObject *tmp, *pnew;
8908 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008909
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008910 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8911 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8912 if (tmp == NULL)
8913 return NULL;
8914 assert(PyUnicode_Check(tmp));
8915 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8916 if (pnew == NULL) {
8917 Py_DECREF(tmp);
8918 return NULL;
8919 }
8920 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8921 if (pnew->str == NULL) {
8922 _Py_ForgetReference((PyObject *)pnew);
8923 PyObject_Del(pnew);
8924 Py_DECREF(tmp);
8925 return PyErr_NoMemory();
8926 }
8927 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8928 pnew->length = n;
8929 pnew->hash = tmp->hash;
8930 Py_DECREF(tmp);
8931 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008932}
8933
/* Docstring for the unicode type; referenced as tp_doc in PyUnicode_Type.
   The text mirrors the keyword arguments accepted by unicode_new(). */
PyDoc_STRVAR(unicode_doc,
             "unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008940
/* Type object for the built-in "unicode" type.

   Slots set to 0 are left unset in this static initializer.  The type is
   finalized by the PyType_Ready(&PyUnicode_Type) call in _PyUnicode_Init(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8984
8985/* Initialize the Unicode implementation */
8986
Thomas Wouters78890102000-07-22 19:25:51 +00008987void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008989 int i;
8990
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008991 /* XXX - move this array to unicodectype.c ? */
8992 Py_UNICODE linebreak[] = {
8993 0x000A, /* LINE FEED */
8994 0x000D, /* CARRIAGE RETURN */
8995 0x001C, /* FILE SEPARATOR */
8996 0x001D, /* GROUP SEPARATOR */
8997 0x001E, /* RECORD SEPARATOR */
8998 0x0085, /* NEXT LINE */
8999 0x2028, /* LINE SEPARATOR */
9000 0x2029, /* PARAGRAPH SEPARATOR */
9001 };
9002
Fred Drakee4315f52000-05-09 19:53:39 +00009003 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009004 free_list = NULL;
9005 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009007 if (!unicode_empty)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009008 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00009009
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009010 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009011 for (i = 0; i < 256; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009012 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009013 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009014 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009015
9016 /* initialize the linebreak bloom filter */
9017 bloom_linebreak = make_bloom_mask(
9018 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9019 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009020
9021 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022}
9023
9024/* Finalize the Unicode implementation */
9025
Christian Heimes3b718a72008-02-14 12:47:33 +00009026int
9027PyUnicode_ClearFreeList(void)
9028{
9029 int freelist_size = numfree;
9030 PyUnicodeObject *u;
9031
9032 for (u = free_list; u != NULL;) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009033 PyUnicodeObject *v = u;
9034 u = *(PyUnicodeObject **)u;
9035 if (v->str)
9036 PyObject_DEL(v->str);
9037 Py_XDECREF(v->defenc);
9038 PyObject_Del(v);
9039 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00009040 }
9041 free_list = NULL;
9042 assert(numfree == 0);
9043 return freelist_size;
9044}
9045
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046void
Thomas Wouters78890102000-07-22 19:25:51 +00009047_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009049 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009051 Py_XDECREF(unicode_empty);
9052 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009053
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009054 for (i = 0; i < 256; i++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009055 if (unicode_latin1[i]) {
9056 Py_DECREF(unicode_latin1[i]);
9057 unicode_latin1[i] = NULL;
9058 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009059 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009060 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009062
Anthony Baxterac6bd462006-04-13 02:06:09 +00009063#ifdef __cplusplus
9064}
9065#endif
9066
9067
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009068/*
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009069 Local variables:
9070 c-basic-offset: 4
9071 indent-tabs-mode: nil
9072 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009073*/