blob: 9615d432274f5f4d0f0a3b939b9e9629c6aa9c9a [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Lookup table for the most frequent whitespace characters: the entry
   at index c (0 <= c < 128) is 1 when c is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Lookup table for line break characters in the ASCII range: the entry
   at index c (0 <= c < 128) is 1 when c is a line break and 0 otherwise.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177 return 0x10FFFF;
178#else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182#endif
183}
184
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000185/* --- Bloom Filters ----------------------------------------------------- */
186
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
190
191/* the linebreak mask is set up by Unicode_Init below */
192
/* Integer type that holds a bloom bitmask (one bit per value of ch & 0x1F). */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low five bits of ch is set in mask.
   The shifted constant is an unsigned long so that bit 31 can be reached
   without shifting into the sign bit of an int, and mask is parenthesized
   so that compound expressions may be passed as the first argument. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero when ch is a line break: characters below 128 are answered
   from the ascii_linebreak table, everything else is pre-screened with
   the bloom mask before calling Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* True when chr is a member of set (of length setlen).  The bloom mask
   is consulted first so that most non-members are rejected without
   scanning the set.  The whole expansion is parenthesized so that the
   macro can safely be negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
/* We allocate one more Py_UNICODE than requested to make sure the
   string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
290
291static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293{
294 register PyUnicodeObject *unicode;
295
Andrew Dalkee0df7622006-05-27 11:04:36 +0000296 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
300 }
301
Neal Norwitze7d8be82008-07-31 17:17:14 +0000302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
305 }
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000315 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000316 unicode_resize(unicode, length) < 0) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000317 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000318 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 }
320 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000324 }
325 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000328 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 if (unicode == NULL)
331 return NULL;
Neal Norwitz419fd492008-03-17 20:22:43 +0000332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 }
335
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 if (!unicode->str) {
337 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000338 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000339 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
346 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000349 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000351 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000353
354 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000357 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000358 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360}
361
362static
Guido van Rossum9475a232001-10-05 20:51:39 +0000363void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000365 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000366 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 /* Keep-Alive optimization */
368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000369 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 unicode->str = NULL;
371 unicode->length = 0;
372 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000376 }
377 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000383 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000384 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 }
387}
388
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000389static
390int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391{
392 register PyUnicodeObject *v;
393
394 /* Argument checks */
395 if (unicode == NULL) {
396 PyErr_BadInternalCall();
397 return -1;
398 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000399 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000401 PyErr_BadInternalCall();
402 return -1;
403 }
404
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000408 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000415 Py_DECREF(*unicode);
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000416 *unicode = w;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000417 return 0;
418 }
419
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
423}
424
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000425int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
426{
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
428}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000431 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432{
433 PyUnicodeObject *unicode;
434
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
438
439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
443 }
444
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000451 if (!unicode)
452 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000453 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 unicode_latin1[*u] = unicode;
455 }
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
458 }
459 }
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000467 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000468
469 return (PyObject *)unicode;
470}
471
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000472PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
473{
474 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000475
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
478 "Negative size passed to PyUnicode_FromStringAndSize");
479 return NULL;
480 }
481
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
487
488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
492 }
493
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000497 unicode = unicode_latin1[Py_CHARMASK(*u)];
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 }
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
507 }
508
509 return PyUnicode_DecodeUTF8(u, size, NULL);
510 }
511
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
515
516 return (PyObject *)unicode;
517}
518
519PyObject *PyUnicode_FromString(const char *u)
520{
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
525 }
526
527 return PyUnicode_FromStringAndSize(u, size);
528}
529
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530#ifdef HAVE_WCHAR_H
531
532PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000533 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534{
535 PyUnicodeObject *unicode;
536
537 if (w == NULL) {
538 PyErr_BadInternalCall();
539 return NULL;
540 }
541
542 unicode = _PyUnicode_New(size);
543 if (!unicode)
544 return NULL;
545
546 /* Copy the wchar_t data into the new object */
547#ifdef HAVE_USABLE_WCHAR_T
548 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000549#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550 {
551 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000552 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000554 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 *u++ = *w++;
556 }
557#endif
558
559 return (PyObject *)unicode;
560}
561
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000562static void
563makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
564{
565 *fmt++ = '%';
566 if (width) {
567 if (zeropad)
568 *fmt++ = '0';
569 fmt += sprintf(fmt, "%d", width);
570 }
571 if (precision)
572 fmt += sprintf(fmt, ".%d", precision);
573 if (longflag)
574 *fmt++ = 'l';
575 else if (size_tflag) {
576 char *f = PY_FORMAT_SIZE_T;
577 while (*f)
578 *fmt++ = *f++;
579 }
580 *fmt++ = c;
581 *fmt = '\0';
582}
583
584#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
585
586PyObject *
587PyUnicode_FromFormatV(const char *format, va_list vargs)
588{
589 va_list count;
590 Py_ssize_t callcount = 0;
591 PyObject **callresults = NULL;
592 PyObject **callresult = NULL;
593 Py_ssize_t n = 0;
594 int width = 0;
595 int precision = 0;
596 int zeropad;
597 const char* f;
598 Py_UNICODE *s;
599 PyObject *string;
600 /* used by sprintf */
601 char buffer[21];
602 /* use abuffer instead of buffer, if we need more space
603 * (which can happen if there's a format specifier with width). */
604 char *abuffer = NULL;
605 char *realbuffer;
606 Py_ssize_t abuffersize = 0;
607 char fmt[60]; /* should be enough for %0width.precisionld */
608 const char *copy;
609
610#ifdef VA_LIST_IS_ARRAY
611 Py_MEMCPY(count, vargs, sizeof(va_list));
612#else
613#ifdef __va_copy
614 __va_copy(count, vargs);
615#else
616 count = vargs;
617#endif
618#endif
619 /* step 1: count the number of %S/%R format specifications
620 * (we call PyObject_Str()/PyObject_Repr() for these objects
621 * once during step 3 and put the result in an array) */
622 for (f = format; *f; f++) {
623 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
624 ++callcount;
625 }
626 /* step 2: allocate memory for the results of
627 * PyObject_Str()/PyObject_Repr() calls */
628 if (callcount) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000629 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000630 if (!callresults) {
631 PyErr_NoMemory();
632 return NULL;
633 }
634 callresult = callresults;
635 }
636 /* step 3: figure out how large a buffer we need */
637 for (f = format; *f; f++) {
638 if (*f == '%') {
639 const char* p = f;
640 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000641 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000642 width = (width*10) + *f++ - '0';
Neal Norwitzade57d02008-03-23 06:19:57 +0000643 while (*++f && *f != '%' && !isalpha((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000644 ;
645
646 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
647 * they don't affect the amount of space we reserve.
648 */
649 if ((*f == 'l' || *f == 'z') &&
650 (f[1] == 'd' || f[1] == 'u'))
651 ++f;
652
653 switch (*f) {
654 case 'c':
655 (void)va_arg(count, int);
656 /* fall through... */
657 case '%':
658 n++;
659 break;
660 case 'd': case 'u': case 'i': case 'x':
661 (void) va_arg(count, int);
662 /* 20 bytes is enough to hold a 64-bit
663 integer. Decimal takes the most space.
664 This isn't enough for octal.
665 If a width is specified we need more
666 (which we allocate later). */
667 if (width < 20)
668 width = 20;
669 n += width;
670 if (abuffersize < width)
671 abuffersize = width;
672 break;
673 case 's':
674 {
675 /* UTF-8 */
676 unsigned char*s;
677 s = va_arg(count, unsigned char*);
678 while (*s) {
679 if (*s < 128) {
680 n++; s++;
681 } else if (*s < 0xc0) {
682 /* invalid UTF-8 */
683 n++; s++;
684 } else if (*s < 0xc0) {
685 n++;
686 s++; if(!*s)break;
687 s++;
688 } else if (*s < 0xe0) {
689 n++;
690 s++; if(!*s)break;
691 s++; if(!*s)break;
692 s++;
693 } else {
694 #ifdef Py_UNICODE_WIDE
695 n++;
696 #else
697 n+=2;
698 #endif
699 s++; if(!*s)break;
700 s++; if(!*s)break;
701 s++; if(!*s)break;
702 s++;
703 }
704 }
705 break;
706 }
707 case 'U':
708 {
709 PyObject *obj = va_arg(count, PyObject *);
710 assert(obj && PyUnicode_Check(obj));
711 n += PyUnicode_GET_SIZE(obj);
712 break;
713 }
714 case 'V':
715 {
716 PyObject *obj = va_arg(count, PyObject *);
717 const char *str = va_arg(count, const char *);
718 assert(obj || str);
719 assert(!obj || PyUnicode_Check(obj));
720 if (obj)
721 n += PyUnicode_GET_SIZE(obj);
722 else
723 n += strlen(str);
724 break;
725 }
726 case 'S':
727 {
728 PyObject *obj = va_arg(count, PyObject *);
729 PyObject *str;
730 assert(obj);
731 str = PyObject_Str(obj);
732 if (!str)
733 goto fail;
734 n += PyUnicode_GET_SIZE(str);
735 /* Remember the str and switch to the next slot */
736 *callresult++ = str;
737 break;
738 }
739 case 'R':
740 {
741 PyObject *obj = va_arg(count, PyObject *);
742 PyObject *repr;
743 assert(obj);
744 repr = PyObject_Repr(obj);
745 if (!repr)
746 goto fail;
747 n += PyUnicode_GET_SIZE(repr);
748 /* Remember the repr and switch to the next slot */
749 *callresult++ = repr;
750 break;
751 }
752 case 'p':
753 (void) va_arg(count, int);
754 /* maximum 64-bit pointer representation:
755 * 0xffffffffffffffff
756 * so 19 characters is enough.
757 * XXX I count 18 -- what's the extra for?
758 */
759 n += 19;
760 break;
761 default:
762 /* if we stumble upon an unknown
763 formatting code, copy the rest of
764 the format string to the output
765 string. (we cannot just skip the
766 code, since there's no way to know
767 what's in the argument list) */
768 n += strlen(p);
769 goto expand;
770 }
771 } else
772 n++;
773 }
774 expand:
775 if (abuffersize > 20) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000776 abuffer = PyObject_Malloc(abuffersize);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000777 if (!abuffer) {
778 PyErr_NoMemory();
779 goto fail;
780 }
781 realbuffer = abuffer;
782 }
783 else
784 realbuffer = buffer;
785 /* step 4: fill the buffer */
786 /* Since we've analyzed how much space we need for the worst case,
787 we don't have to resize the string.
788 There can be no errors beyond this point. */
789 string = PyUnicode_FromUnicode(NULL, n);
790 if (!string)
791 goto fail;
792
793 s = PyUnicode_AS_UNICODE(string);
794 callresult = callresults;
795
796 for (f = format; *f; f++) {
797 if (*f == '%') {
798 const char* p = f++;
799 int longflag = 0;
800 int size_tflag = 0;
801 zeropad = (*f == '0');
802 /* parse the width.precision part */
803 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000804 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000805 width = (width*10) + *f++ - '0';
806 precision = 0;
807 if (*f == '.') {
808 f++;
Neal Norwitzade57d02008-03-23 06:19:57 +0000809 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000810 precision = (precision*10) + *f++ - '0';
811 }
812 /* handle the long flag, but only for %ld and %lu.
813 others can be added when necessary. */
814 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
815 longflag = 1;
816 ++f;
817 }
818 /* handle the size_t flag. */
819 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
820 size_tflag = 1;
821 ++f;
822 }
823
824 switch (*f) {
825 case 'c':
826 *s++ = va_arg(vargs, int);
827 break;
828 case 'd':
829 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
830 if (longflag)
831 sprintf(realbuffer, fmt, va_arg(vargs, long));
832 else if (size_tflag)
833 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
834 else
835 sprintf(realbuffer, fmt, va_arg(vargs, int));
836 appendstring(realbuffer);
837 break;
838 case 'u':
839 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
840 if (longflag)
841 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
842 else if (size_tflag)
843 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
844 else
845 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
846 appendstring(realbuffer);
847 break;
848 case 'i':
849 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
850 sprintf(realbuffer, fmt, va_arg(vargs, int));
851 appendstring(realbuffer);
852 break;
853 case 'x':
854 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
855 sprintf(realbuffer, fmt, va_arg(vargs, int));
856 appendstring(realbuffer);
857 break;
858 case 's':
859 {
860 /* Parameter must be UTF-8 encoded.
861 In case of encoding errors, use
862 the replacement character. */
863 PyObject *u;
864 p = va_arg(vargs, char*);
865 u = PyUnicode_DecodeUTF8(p, strlen(p),
866 "replace");
867 if (!u)
868 goto fail;
869 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
870 PyUnicode_GET_SIZE(u));
871 s += PyUnicode_GET_SIZE(u);
872 Py_DECREF(u);
873 break;
874 }
875 case 'U':
876 {
877 PyObject *obj = va_arg(vargs, PyObject *);
878 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
879 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
880 s += size;
881 break;
882 }
883 case 'V':
884 {
885 PyObject *obj = va_arg(vargs, PyObject *);
886 const char *str = va_arg(vargs, const char *);
887 if (obj) {
888 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
889 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
890 s += size;
891 } else {
892 appendstring(str);
893 }
894 break;
895 }
896 case 'S':
897 case 'R':
898 {
899 Py_UNICODE *ucopy;
900 Py_ssize_t usize;
901 Py_ssize_t upos;
902 /* unused, since we already have the result */
903 (void) va_arg(vargs, PyObject *);
904 ucopy = PyUnicode_AS_UNICODE(*callresult);
905 usize = PyUnicode_GET_SIZE(*callresult);
906 for (upos = 0; upos<usize;)
907 *s++ = ucopy[upos++];
908 /* We're done with the unicode()/repr() => forget it */
909 Py_DECREF(*callresult);
910 /* switch to next unicode()/repr() result */
911 ++callresult;
912 break;
913 }
914 case 'p':
915 sprintf(buffer, "%p", va_arg(vargs, void*));
916 /* %p is ill-defined: ensure leading 0x. */
917 if (buffer[1] == 'X')
918 buffer[1] = 'x';
919 else if (buffer[1] != 'x') {
920 memmove(buffer+2, buffer, strlen(buffer)+1);
921 buffer[0] = '0';
922 buffer[1] = 'x';
923 }
924 appendstring(buffer);
925 break;
926 case '%':
927 *s++ = '%';
928 break;
929 default:
930 appendstring(p);
931 goto end;
932 }
933 } else
934 *s++ = *f;
935 }
936
937 end:
938 if (callresults)
Neal Norwitz419fd492008-03-17 20:22:43 +0000939 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000940 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000941 PyObject_Free(abuffer);
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000942 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000943 return string;
944 fail:
945 if (callresults) {
946 PyObject **callresult2 = callresults;
947 while (callresult2 < callresult) {
948 Py_DECREF(*callresult2);
949 ++callresult2;
950 }
Neal Norwitz419fd492008-03-17 20:22:43 +0000951 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000952 }
953 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000954 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000955 return NULL;
956}
957
958#undef appendstring
959
960PyObject *
961PyUnicode_FromFormat(const char *format, ...)
962{
963 PyObject* ret;
964 va_list vargs;
965
966#ifdef HAVE_STDARG_PROTOTYPES
967 va_start(vargs, format);
968#else
969 va_start(vargs);
970#endif
971 ret = PyUnicode_FromFormatV(format, vargs);
972 va_end(vargs);
973 return ret;
974}
975
Martin v. Löwis18e16552006-02-15 17:27:45 +0000976Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
977 wchar_t *w,
978 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979{
980 if (unicode == NULL) {
981 PyErr_BadInternalCall();
982 return -1;
983 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000984
985 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000987 size = PyUnicode_GET_SIZE(unicode) + 1;
988
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989#ifdef HAVE_USABLE_WCHAR_T
990 memcpy(w, unicode->str, size * sizeof(wchar_t));
991#else
992 {
993 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000994 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000995 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000996 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000997 *w++ = *u++;
998 }
999#endif
1000
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001001 if (size > PyUnicode_GET_SIZE(unicode))
1002 return PyUnicode_GET_SIZE(unicode);
1003 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001004 return size;
1005}
1006
1007#endif
1008
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001009PyObject *PyUnicode_FromOrdinal(int ordinal)
1010{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001011 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001012
1013#ifdef Py_UNICODE_WIDE
1014 if (ordinal < 0 || ordinal > 0x10ffff) {
1015 PyErr_SetString(PyExc_ValueError,
1016 "unichr() arg not in range(0x110000) "
1017 "(wide Python build)");
1018 return NULL;
1019 }
1020#else
1021 if (ordinal < 0 || ordinal > 0xffff) {
1022 PyErr_SetString(PyExc_ValueError,
1023 "unichr() arg not in range(0x10000) "
1024 "(narrow Python build)");
1025 return NULL;
1026 }
1027#endif
1028
Hye-Shik Chang40574832004-04-06 07:24:51 +00001029 s[0] = (Py_UNICODE)ordinal;
1030 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001031}
1032
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033PyObject *PyUnicode_FromObject(register PyObject *obj)
1034{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001035 /* XXX Perhaps we should make this API an alias of
1036 PyObject_Unicode() instead ?! */
1037 if (PyUnicode_CheckExact(obj)) {
1038 Py_INCREF(obj);
1039 return obj;
1040 }
1041 if (PyUnicode_Check(obj)) {
1042 /* For a Unicode subtype that's not a Unicode object,
1043 return a true Unicode object with the same data. */
1044 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1045 PyUnicode_GET_SIZE(obj));
1046 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001047 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1048}
1049
1050PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1051 const char *encoding,
1052 const char *errors)
1053{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001054 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001055 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001056 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001057
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 if (obj == NULL) {
1059 PyErr_BadInternalCall();
1060 return NULL;
1061 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001062
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063#if 0
1064 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001065 that no encodings is given and then redirect to
1066 PyObject_Unicode() which then applies the additional logic for
1067 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001068
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001069 NOTE: This API should really only be used for object which
1070 represent *encoded* Unicode !
1071
1072 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001073 if (PyUnicode_Check(obj)) {
1074 if (encoding) {
1075 PyErr_SetString(PyExc_TypeError,
1076 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001077 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001078 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001079 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001080 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001081#else
1082 if (PyUnicode_Check(obj)) {
1083 PyErr_SetString(PyExc_TypeError,
1084 "decoding Unicode is not supported");
1085 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001086 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001087#endif
1088
1089 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001090 if (PyString_Check(obj)) {
1091 s = PyString_AS_STRING(obj);
1092 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001093 }
Christian Heimes3497f942008-05-26 12:29:14 +00001094 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001095 /* Python 2.x specific */
1096 PyErr_Format(PyExc_TypeError,
1097 "decoding bytearray is not supported");
1098 return NULL;
1099 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001100 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1101 /* Overwrite the error message with something more useful in
1102 case of a TypeError. */
1103 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001104 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001105 "coercing to Unicode: need string or buffer, "
1106 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001107 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001108 goto onError;
1109 }
Tim Petersced69f82003-09-16 20:30:58 +00001110
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 if (len == 0) {
1113 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001114 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 }
Tim Petersced69f82003-09-16 20:30:58 +00001116 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001117 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001118
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001119 return v;
1120
1121 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123}
1124
1125PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001126 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 const char *encoding,
1128 const char *errors)
1129{
1130 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001131
1132 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001133 encoding = PyUnicode_GetDefaultEncoding();
1134
1135 /* Shortcuts for common default encodings */
1136 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001138 else if (strcmp(encoding, "latin-1") == 0)
1139 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001140#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1141 else if (strcmp(encoding, "mbcs") == 0)
1142 return PyUnicode_DecodeMBCS(s, size, errors);
1143#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001144 else if (strcmp(encoding, "ascii") == 0)
1145 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146
1147 /* Decode via the codec registry */
1148 buffer = PyBuffer_FromMemory((void *)s, size);
1149 if (buffer == NULL)
1150 goto onError;
1151 unicode = PyCodec_Decode(buffer, encoding, errors);
1152 if (unicode == NULL)
1153 goto onError;
1154 if (!PyUnicode_Check(unicode)) {
1155 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001156 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001157 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 Py_DECREF(unicode);
1159 goto onError;
1160 }
1161 Py_DECREF(buffer);
1162 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001163
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 onError:
1165 Py_XDECREF(buffer);
1166 return NULL;
1167}
1168
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001169PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1170 const char *encoding,
1171 const char *errors)
1172{
1173 PyObject *v;
1174
1175 if (!PyUnicode_Check(unicode)) {
1176 PyErr_BadArgument();
1177 goto onError;
1178 }
1179
1180 if (encoding == NULL)
1181 encoding = PyUnicode_GetDefaultEncoding();
1182
1183 /* Decode via the codec registry */
1184 v = PyCodec_Decode(unicode, encoding, errors);
1185 if (v == NULL)
1186 goto onError;
1187 return v;
1188
1189 onError:
1190 return NULL;
1191}
1192
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001194 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 const char *encoding,
1196 const char *errors)
1197{
1198 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001199
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 unicode = PyUnicode_FromUnicode(s, size);
1201 if (unicode == NULL)
1202 return NULL;
1203 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1204 Py_DECREF(unicode);
1205 return v;
1206}
1207
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001208PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1209 const char *encoding,
1210 const char *errors)
1211{
1212 PyObject *v;
1213
1214 if (!PyUnicode_Check(unicode)) {
1215 PyErr_BadArgument();
1216 goto onError;
1217 }
1218
1219 if (encoding == NULL)
1220 encoding = PyUnicode_GetDefaultEncoding();
1221
1222 /* Encode via the codec registry */
1223 v = PyCodec_Encode(unicode, encoding, errors);
1224 if (v == NULL)
1225 goto onError;
1226 return v;
1227
1228 onError:
1229 return NULL;
1230}
1231
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1233 const char *encoding,
1234 const char *errors)
1235{
1236 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001237
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 if (!PyUnicode_Check(unicode)) {
1239 PyErr_BadArgument();
1240 goto onError;
1241 }
Fred Drakee4315f52000-05-09 19:53:39 +00001242
Tim Petersced69f82003-09-16 20:30:58 +00001243 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001244 encoding = PyUnicode_GetDefaultEncoding();
1245
1246 /* Shortcuts for common default encodings */
1247 if (errors == NULL) {
1248 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001249 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001250 else if (strcmp(encoding, "latin-1") == 0)
1251 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001252#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1253 else if (strcmp(encoding, "mbcs") == 0)
1254 return PyUnicode_AsMBCSString(unicode);
1255#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001256 else if (strcmp(encoding, "ascii") == 0)
1257 return PyUnicode_AsASCIIString(unicode);
1258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259
1260 /* Encode via the codec registry */
1261 v = PyCodec_Encode(unicode, encoding, errors);
1262 if (v == NULL)
1263 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001264 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001266 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001267 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 Py_DECREF(v);
1269 goto onError;
1270 }
1271 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001272
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 onError:
1274 return NULL;
1275}
1276
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001277PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1278 const char *errors)
1279{
1280 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1281
1282 if (v)
1283 return v;
1284 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1285 if (v && errors == NULL)
1286 ((PyUnicodeObject *)unicode)->defenc = v;
1287 return v;
1288}
1289
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1291{
1292 if (!PyUnicode_Check(unicode)) {
1293 PyErr_BadArgument();
1294 goto onError;
1295 }
1296 return PyUnicode_AS_UNICODE(unicode);
1297
1298 onError:
1299 return NULL;
1300}
1301
Martin v. Löwis18e16552006-02-15 17:27:45 +00001302Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303{
1304 if (!PyUnicode_Check(unicode)) {
1305 PyErr_BadArgument();
1306 goto onError;
1307 }
1308 return PyUnicode_GET_SIZE(unicode);
1309
1310 onError:
1311 return -1;
1312}
1313
Thomas Wouters78890102000-07-22 19:25:51 +00001314const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001315{
1316 return unicode_default_encoding;
1317}
1318
1319int PyUnicode_SetDefaultEncoding(const char *encoding)
1320{
1321 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001322
Fred Drakee4315f52000-05-09 19:53:39 +00001323 /* Make sure the encoding is valid. As side effect, this also
1324 loads the encoding into the codec registry cache. */
1325 v = _PyCodec_Lookup(encoding);
1326 if (v == NULL)
1327 goto onError;
1328 Py_DECREF(v);
1329 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001330 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001331 sizeof(unicode_default_encoding));
1332 return 0;
1333
1334 onError:
1335 return -1;
1336}
1337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001338/* error handling callback helper:
1339 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001340 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001341 and adjust various state variables.
1342 return 0 on success, -1 on error
1343*/
1344
1345static
1346int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1347 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001348 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1349 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00001350 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001352 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001353
1354 PyObject *restuple = NULL;
1355 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001356 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1357 Py_ssize_t requiredsize;
1358 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001359 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001360 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001361 int res = -1;
1362
1363 if (*errorHandler == NULL) {
1364 *errorHandler = PyCodec_LookupError(errors);
1365 if (*errorHandler == NULL)
1366 goto onError;
1367 }
1368
1369 if (*exceptionObject == NULL) {
1370 *exceptionObject = PyUnicodeDecodeError_Create(
1371 encoding, input, insize, *startinpos, *endinpos, reason);
1372 if (*exceptionObject == NULL)
1373 goto onError;
1374 }
1375 else {
1376 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1377 goto onError;
1378 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1379 goto onError;
1380 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1381 goto onError;
1382 }
1383
1384 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1385 if (restuple == NULL)
1386 goto onError;
1387 if (!PyTuple_Check(restuple)) {
1388 PyErr_Format(PyExc_TypeError, &argparse[4]);
1389 goto onError;
1390 }
1391 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1392 goto onError;
1393 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001394 newpos = insize+newpos;
1395 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001396 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001397 goto onError;
1398 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399
1400 /* need more space? (at least enough for what we
1401 have+the replacement+the rest of the string (starting
1402 at the new input position), so we won't have to check space
1403 when there are no errors in the rest of the string) */
1404 repptr = PyUnicode_AS_UNICODE(repunicode);
1405 repsize = PyUnicode_GET_SIZE(repunicode);
1406 requiredsize = *outpos + repsize + insize-newpos;
1407 if (requiredsize > outsize) {
1408 if (requiredsize<2*outsize)
1409 requiredsize = 2*outsize;
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00001410 if (_PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001411 goto onError;
1412 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1413 }
1414 *endinpos = newpos;
1415 *inptr = input + newpos;
1416 Py_UNICODE_COPY(*outptr, repptr, repsize);
1417 *outptr += repsize;
1418 *outpos += repsize;
1419 /* we made it! */
1420 res = 0;
1421
1422 onError:
1423 Py_XDECREF(restuple);
1424 return res;
1425}
1426
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001427/* --- UTF-7 Codec -------------------------------------------------------- */
1428
1429/* see RFC2152 for details */
1430
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Each entry tells whether the character is special, i.e. cannot be
   directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1449
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS) is true when c must not be written
   directly: anything outside 1..127, anything classified 1 in
   utf7_special, and - if the respective flag is set - whitespace
   (class 2) or Set O characters (class 3). */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is one of the 64 base64 digits.
   The test is spelled out as ASCII ranges instead of using isalnum():
   isalnum() depends on the current locale and is undefined for
   arguments that are not representable as unsigned char, whereas the
   base64 alphabet is fixed and c may be a wide code unit. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of the base64 digit c.  Only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   emit the topmost six of them as a base64 digit through out and
   reduce bits accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the topmost sixteen as one code unit.  A unit in
   0xDC00..0xDFFF sets surrogate, sets errmsg and jumps to utf7Error;
   the unit following such an error is dropped and clears surrogate.
   Every other unit is stored through out.  The expansion site must
   provide the errmsg variable and the utf7Error label. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001492
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001493PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001494 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001495 const char *errors)
1496{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001497 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1498}
1499
1500PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1501 Py_ssize_t size,
1502 const char *errors,
1503 Py_ssize_t *consumed)
1504{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001505 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001506 Py_ssize_t startinpos;
1507 Py_ssize_t endinpos;
1508 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001509 const char *e;
1510 PyUnicodeObject *unicode;
1511 Py_UNICODE *p;
1512 const char *errmsg = "";
1513 int inShift = 0;
1514 unsigned int bitsleft = 0;
1515 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001516 int surrogate = 0;
1517 PyObject *errorHandler = NULL;
1518 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001519
1520 unicode = _PyUnicode_New(size);
1521 if (!unicode)
1522 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001523 if (size == 0) {
1524 if (consumed)
1525 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001527 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001528
1529 p = unicode->str;
1530 e = s + size;
1531
1532 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001533 Py_UNICODE ch;
1534 restart:
Antoine Pitrou4982d5d2008-07-25 17:45:59 +00001535 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001536
1537 if (inShift) {
1538 if ((ch == '-') || !B64CHAR(ch)) {
1539 inShift = 0;
1540 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001541
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001542 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1543 if (bitsleft >= 6) {
1544 /* The shift sequence has a partial character in it. If
1545 bitsleft < 6 then we could just classify it as padding
1546 but that is not the case here */
1547
1548 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001549 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550 }
1551 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001552 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 here so indicate the potential of a misencoded character. */
1554
1555 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1556 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1557 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001558 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559 }
1560
1561 if (ch == '-') {
1562 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001563 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001564 inShift = 1;
1565 }
1566 } else if (SPECIAL(ch,0,0)) {
1567 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001568 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001569 } else {
1570 *p++ = ch;
1571 }
1572 } else {
1573 charsleft = (charsleft << 6) | UB64(ch);
1574 bitsleft += 6;
1575 s++;
1576 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1577 }
1578 }
1579 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001580 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581 s++;
1582 if (s < e && *s == '-') {
1583 s++;
1584 *p++ = '+';
1585 } else
1586 {
1587 inShift = 1;
1588 bitsleft = 0;
1589 }
1590 }
1591 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001592 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 errmsg = "unexpected special character";
1594 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001595 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001596 }
1597 else {
1598 *p++ = ch;
1599 s++;
1600 }
1601 continue;
1602 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001603 outpos = p-PyUnicode_AS_UNICODE(unicode);
1604 endinpos = s-starts;
1605 if (unicode_decode_call_errorhandler(
1606 errors, &errorHandler,
1607 "utf7", errmsg,
1608 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00001609 &unicode, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611 }
1612
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001613 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001614 outpos = p-PyUnicode_AS_UNICODE(unicode);
1615 endinpos = size;
1616 if (unicode_decode_call_errorhandler(
1617 errors, &errorHandler,
1618 "utf7", "unterminated shift sequence",
1619 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00001620 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001621 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001622 if (s < e)
1623 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001625 if (consumed) {
1626 if(inShift)
1627 *consumed = startinpos;
1628 else
1629 *consumed = s-starts;
1630 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001631
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001632 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001633 goto onError;
1634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001635 Py_XDECREF(errorHandler);
1636 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001637 return (PyObject *)unicode;
1638
1639onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 Py_XDECREF(errorHandler);
1641 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 Py_DECREF(unicode);
1643 return NULL;
1644}
1645
1646
1647PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001648 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001649 int encodeSetO,
1650 int encodeWhiteSpace,
1651 const char *errors)
1652{
1653 PyObject *v;
1654 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001655 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001656 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001657 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 unsigned int bitsleft = 0;
1659 unsigned long charsleft = 0;
1660 char * out;
1661 char * start;
1662
Neal Norwitze7d8be82008-07-31 17:17:14 +00001663 if (cbAllocated / 5 != size)
1664 return PyErr_NoMemory();
1665
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666 if (size == 0)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001667 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001669 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 if (v == NULL)
1671 return NULL;
1672
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001673 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001674 for (;i < size; ++i) {
1675 Py_UNICODE ch = s[i];
1676
1677 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001678 if (ch == '+') {
1679 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001680 *out++ = '-';
1681 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1682 charsleft = ch;
1683 bitsleft = 16;
1684 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001685 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001686 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001687 } else {
1688 *out++ = (char) ch;
1689 }
1690 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1692 *out++ = B64(charsleft << (6-bitsleft));
1693 charsleft = 0;
1694 bitsleft = 0;
1695 /* Characters not in the BASE64 set implicitly unshift the sequence
1696 so no '-' is required, except if the character is itself a '-' */
1697 if (B64CHAR(ch) || ch == '-') {
1698 *out++ = '-';
1699 }
1700 inShift = 0;
1701 *out++ = (char) ch;
1702 } else {
1703 bitsleft += 16;
1704 charsleft = (charsleft << 16) | ch;
1705 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1706
1707 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001708 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001709 or '-' then the shift sequence will be terminated implicitly and we
1710 don't have to insert a '-'. */
1711
1712 if (bitsleft == 0) {
1713 if (i + 1 < size) {
1714 Py_UNICODE ch2 = s[i+1];
1715
1716 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001717
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 } else if (B64CHAR(ch2) || ch2 == '-') {
1719 *out++ = '-';
1720 inShift = 0;
1721 } else {
1722 inShift = 0;
1723 }
1724
1725 }
1726 else {
1727 *out++ = '-';
1728 inShift = 0;
1729 }
1730 }
Tim Petersced69f82003-09-16 20:30:58 +00001731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001733 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001734 if (bitsleft) {
1735 *out++= B64(charsleft << (6-bitsleft) );
1736 *out++ = '-';
1737 }
1738
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001739 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 return v;
1741}
1742
1743#undef SPECIAL
1744#undef B64
1745#undef B64CHAR
1746#undef UB64
1747#undef ENCODE
1748#undef DECODE
1749
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750/* --- UTF-8 Codec -------------------------------------------------------- */
1751
/* Sequence length implied by the first byte of a UTF-8 sequence, indexed by
   that byte's value.  A zero entry marks a byte that cannot start a sequence.
   See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80-0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: invalid */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
1773
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001775 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 const char *errors)
1777{
Walter Dörwald69652032004-09-07 20:24:22 +00001778 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1779}
1780
1781PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001782 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001783 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001784 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788 Py_ssize_t startinpos;
1789 Py_ssize_t endinpos;
1790 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791 const char *e;
1792 PyUnicodeObject *unicode;
1793 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001794 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 PyObject *errorHandler = NULL;
1796 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797
1798 /* Note: size will always be longer than the resulting Unicode
1799 character count */
1800 unicode = _PyUnicode_New(size);
1801 if (!unicode)
1802 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001803 if (size == 0) {
1804 if (consumed)
1805 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808
1809 /* Unpack UTF-8 encoded data */
1810 p = unicode->str;
1811 e = s + size;
1812
1813 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001814 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815
1816 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001817 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818 s++;
1819 continue;
1820 }
1821
1822 n = utf8_code_length[ch];
1823
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001824 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001825 if (consumed)
1826 break;
1827 else {
1828 errmsg = "unexpected end of data";
1829 startinpos = s-starts;
1830 endinpos = size;
1831 goto utf8Error;
1832 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001833 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834
1835 switch (n) {
1836
1837 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001838 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 startinpos = s-starts;
1840 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001841 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842
1843 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 startinpos = s-starts;
1846 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001847 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848
1849 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 if ((s[1] & 0xc0) != 0x80) {
1851 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001852 startinpos = s-starts;
1853 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001854 goto utf8Error;
1855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 startinpos = s-starts;
1859 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 errmsg = "illegal encoding";
1861 goto utf8Error;
1862 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001864 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865 break;
1866
1867 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001868 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001869 (s[2] & 0xc0) != 0x80) {
1870 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001871 startinpos = s-starts;
1872 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001873 goto utf8Error;
1874 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001876 if (ch < 0x0800) {
1877 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001878 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001879
1880 XXX For wide builds (UCS-4) we should probably try
1881 to recombine the surrogates into a single code
1882 unit.
1883 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001884 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001885 startinpos = s-starts;
1886 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001887 goto utf8Error;
1888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001890 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001891 break;
1892
1893 case 4:
1894 if ((s[1] & 0xc0) != 0x80 ||
1895 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001896 (s[3] & 0xc0) != 0x80) {
1897 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 startinpos = s-starts;
1899 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001900 goto utf8Error;
1901 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001902 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1903 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1904 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001905 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001906 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001908 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001909 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001910 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001911 startinpos = s-starts;
1912 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001913 goto utf8Error;
1914 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001915#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001916 *p++ = (Py_UNICODE)ch;
1917#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001918 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001919
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001920 /* translate from 10000..10FFFF to 0..FFFF */
1921 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001922
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001923 /* high surrogate = top 10 bits added to D800 */
1924 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001925
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001926 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001927 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001928#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 break;
1930
1931 default:
1932 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001933 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 startinpos = s-starts;
1935 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 }
1938 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001939 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001940
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001941 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001942 outpos = p-PyUnicode_AS_UNICODE(unicode);
1943 if (unicode_decode_call_errorhandler(
1944 errors, &errorHandler,
1945 "utf8", errmsg,
1946 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00001947 &unicode, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 }
Walter Dörwald69652032004-09-07 20:24:22 +00001950 if (consumed)
1951 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952
1953 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001954 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 goto onError;
1956
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001957 Py_XDECREF(errorHandler);
1958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 return (PyObject *)unicode;
1960
1961onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001962 Py_XDECREF(errorHandler);
1963 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 Py_DECREF(unicode);
1965 return NULL;
1966}
1967
Tim Peters602f7402002-04-27 18:03:26 +00001968/* Allocation strategy: if the string is short, convert into a stack buffer
1969 and allocate exactly as much space needed at the end. Else allocate the
1970 maximum possible needed (4 result bytes per Unicode character), and return
1971 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001972*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001973PyObject *
1974PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001975 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001976 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977{
Tim Peters602f7402002-04-27 18:03:26 +00001978#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001979
Martin v. Löwis18e16552006-02-15 17:27:45 +00001980 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001981 PyObject *v; /* result string object */
1982 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001983 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001984 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001985 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001986
Tim Peters602f7402002-04-27 18:03:26 +00001987 assert(s != NULL);
1988 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989
Tim Peters602f7402002-04-27 18:03:26 +00001990 if (size <= MAX_SHORT_UNICHARS) {
1991 /* Write into the stack buffer; nallocated can't overflow.
1992 * At the end, we'll allocate exactly as much heap space as it
1993 * turns out we need.
1994 */
1995 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1996 v = NULL; /* will allocate after we're done */
1997 p = stackbuf;
1998 }
1999 else {
2000 /* Overallocate on the heap, and give the excess back at the end. */
2001 nallocated = size * 4;
2002 if (nallocated / 4 != size) /* overflow! */
2003 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002004 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002005 if (v == NULL)
2006 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002007 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002008 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002009
Tim Peters602f7402002-04-27 18:03:26 +00002010 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002011 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002012
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002013 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002014 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002016
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002018 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002019 *p++ = (char)(0xc0 | (ch >> 6));
2020 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002021 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002022 else {
Tim Peters602f7402002-04-27 18:03:26 +00002023 /* Encode UCS2 Unicode ordinals */
2024 if (ch < 0x10000) {
2025 /* Special case: check for high surrogate */
2026 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2027 Py_UCS4 ch2 = s[i];
2028 /* Check for low surrogate and combine the two to
2029 form a UCS4 value */
2030 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002031 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002032 i++;
2033 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002034 }
Tim Peters602f7402002-04-27 18:03:26 +00002035 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002036 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002037 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002038 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2039 *p++ = (char)(0x80 | (ch & 0x3f));
2040 continue;
2041 }
2042encodeUCS4:
2043 /* Encode UCS4 Unicode ordinals */
2044 *p++ = (char)(0xf0 | (ch >> 18));
2045 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2046 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2047 *p++ = (char)(0x80 | (ch & 0x3f));
2048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002050
Tim Peters602f7402002-04-27 18:03:26 +00002051 if (v == NULL) {
2052 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002053 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002054 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002055 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002056 }
2057 else {
2058 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002059 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002060 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002061 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002064
Tim Peters602f7402002-04-27 18:03:26 +00002065#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066}
2067
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2069{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 if (!PyUnicode_Check(unicode)) {
2071 PyErr_BadArgument();
2072 return NULL;
2073 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002074 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2075 PyUnicode_GET_SIZE(unicode),
2076 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077}
2078
Walter Dörwald6e390802007-08-17 16:41:28 +00002079/* --- UTF-32 Codec ------------------------------------------------------- */
2080
2081PyObject *
2082PyUnicode_DecodeUTF32(const char *s,
2083 Py_ssize_t size,
2084 const char *errors,
2085 int *byteorder)
2086{
2087 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2088}
2089
2090PyObject *
2091PyUnicode_DecodeUTF32Stateful(const char *s,
2092 Py_ssize_t size,
2093 const char *errors,
2094 int *byteorder,
2095 Py_ssize_t *consumed)
2096{
2097 const char *starts = s;
2098 Py_ssize_t startinpos;
2099 Py_ssize_t endinpos;
2100 Py_ssize_t outpos;
2101 PyUnicodeObject *unicode;
2102 Py_UNICODE *p;
2103#ifndef Py_UNICODE_WIDE
2104 int i, pairs;
2105#else
2106 const int pairs = 0;
2107#endif
2108 const unsigned char *q, *e;
2109 int bo = 0; /* assume native ordering by default */
2110 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002111 /* Offsets from q for retrieving bytes in the right order. */
2112#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2113 int iorder[] = {0, 1, 2, 3};
2114#else
2115 int iorder[] = {3, 2, 1, 0};
2116#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002117 PyObject *errorHandler = NULL;
2118 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002119 /* On narrow builds we split characters outside the BMP into two
2120 codepoints => count how much extra space we need. */
2121#ifndef Py_UNICODE_WIDE
2122 for (i = pairs = 0; i < size/4; i++)
2123 if (((Py_UCS4 *)s)[i] >= 0x10000)
2124 pairs++;
2125#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002126
2127 /* This might be one to much, because of a BOM */
2128 unicode = _PyUnicode_New((size+3)/4+pairs);
2129 if (!unicode)
2130 return NULL;
2131 if (size == 0)
2132 return (PyObject *)unicode;
2133
2134 /* Unpack UTF-32 encoded data */
2135 p = unicode->str;
2136 q = (unsigned char *)s;
2137 e = q + size;
2138
2139 if (byteorder)
2140 bo = *byteorder;
2141
2142 /* Check for BOM marks (U+FEFF) in the input and adjust current
2143 byte order setting accordingly. In native mode, the leading BOM
2144 mark is skipped, in all other modes, it is copied to the output
2145 stream as-is (giving a ZWNBSP character). */
2146 if (bo == 0) {
2147 if (size >= 4) {
2148 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2149 (q[iorder[1]] << 8) | q[iorder[0]];
2150#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2151 if (bom == 0x0000FEFF) {
2152 q += 4;
2153 bo = -1;
2154 }
2155 else if (bom == 0xFFFE0000) {
2156 q += 4;
2157 bo = 1;
2158 }
2159#else
2160 if (bom == 0x0000FEFF) {
2161 q += 4;
2162 bo = 1;
2163 }
2164 else if (bom == 0xFFFE0000) {
2165 q += 4;
2166 bo = -1;
2167 }
2168#endif
2169 }
2170 }
2171
2172 if (bo == -1) {
2173 /* force LE */
2174 iorder[0] = 0;
2175 iorder[1] = 1;
2176 iorder[2] = 2;
2177 iorder[3] = 3;
2178 }
2179 else if (bo == 1) {
2180 /* force BE */
2181 iorder[0] = 3;
2182 iorder[1] = 2;
2183 iorder[2] = 1;
2184 iorder[3] = 0;
2185 }
2186
2187 while (q < e) {
2188 Py_UCS4 ch;
2189 /* remaining bytes at the end? (size should be divisible by 4) */
2190 if (e-q<4) {
2191 if (consumed)
2192 break;
2193 errmsg = "truncated data";
2194 startinpos = ((const char *)q)-starts;
2195 endinpos = ((const char *)e)-starts;
2196 goto utf32Error;
2197 /* The remaining input chars are ignored if the callback
2198 chooses to skip the input */
2199 }
2200 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2201 (q[iorder[1]] << 8) | q[iorder[0]];
2202
2203 if (ch >= 0x110000)
2204 {
2205 errmsg = "codepoint not in range(0x110000)";
2206 startinpos = ((const char *)q)-starts;
2207 endinpos = startinpos+4;
2208 goto utf32Error;
2209 }
2210#ifndef Py_UNICODE_WIDE
2211 if (ch >= 0x10000)
2212 {
2213 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2214 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2215 }
2216 else
2217#endif
2218 *p++ = ch;
2219 q += 4;
2220 continue;
2221 utf32Error:
2222 outpos = p-PyUnicode_AS_UNICODE(unicode);
2223 if (unicode_decode_call_errorhandler(
2224 errors, &errorHandler,
2225 "utf32", errmsg,
2226 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00002227 &unicode, &outpos, &p))
Walter Dörwald6e390802007-08-17 16:41:28 +00002228 goto onError;
2229 }
2230
2231 if (byteorder)
2232 *byteorder = bo;
2233
2234 if (consumed)
2235 *consumed = (const char *)q-starts;
2236
2237 /* Adjust length */
2238 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2239 goto onError;
2240
2241 Py_XDECREF(errorHandler);
2242 Py_XDECREF(exc);
2243 return (PyObject *)unicode;
2244
2245onError:
2246 Py_DECREF(unicode);
2247 Py_XDECREF(errorHandler);
2248 Py_XDECREF(exc);
2249 return NULL;
2250}
2251
2252PyObject *
2253PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2254 Py_ssize_t size,
2255 const char *errors,
2256 int byteorder)
2257{
2258 PyObject *v;
2259 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002260 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002261#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002262 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002263#else
2264 const int pairs = 0;
2265#endif
2266 /* Offsets from p for storing byte pairs in the right order. */
2267#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2268 int iorder[] = {0, 1, 2, 3};
2269#else
2270 int iorder[] = {3, 2, 1, 0};
2271#endif
2272
2273#define STORECHAR(CH) \
2274 do { \
2275 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2276 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2277 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2278 p[iorder[0]] = (CH) & 0xff; \
2279 p += 4; \
2280 } while(0)
2281
2282 /* In narrow builds we can output surrogate pairs as one codepoint,
2283 so we need less space. */
2284#ifndef Py_UNICODE_WIDE
2285 for (i = pairs = 0; i < size-1; i++)
2286 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2287 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2288 pairs++;
2289#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002290 nsize = (size - pairs + (byteorder == 0));
2291 bytesize = nsize * 4;
2292 if (bytesize / 4 != nsize)
2293 return PyErr_NoMemory();
2294 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002295 if (v == NULL)
2296 return NULL;
2297
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002298 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002299 if (byteorder == 0)
2300 STORECHAR(0xFEFF);
2301 if (size == 0)
2302 return v;
2303
2304 if (byteorder == -1) {
2305 /* force LE */
2306 iorder[0] = 0;
2307 iorder[1] = 1;
2308 iorder[2] = 2;
2309 iorder[3] = 3;
2310 }
2311 else if (byteorder == 1) {
2312 /* force BE */
2313 iorder[0] = 3;
2314 iorder[1] = 2;
2315 iorder[2] = 1;
2316 iorder[3] = 0;
2317 }
2318
2319 while (size-- > 0) {
2320 Py_UCS4 ch = *s++;
2321#ifndef Py_UNICODE_WIDE
2322 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2323 Py_UCS4 ch2 = *s;
2324 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2325 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2326 s++;
2327 size--;
2328 }
2329 }
2330#endif
2331 STORECHAR(ch);
2332 }
2333 return v;
2334#undef STORECHAR
2335}
2336
2337PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2338{
2339 if (!PyUnicode_Check(unicode)) {
2340 PyErr_BadArgument();
2341 return NULL;
2342 }
2343 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2344 PyUnicode_GET_SIZE(unicode),
2345 NULL,
2346 0);
2347}
2348
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349/* --- UTF-16 Codec ------------------------------------------------------- */
2350
Tim Peters772747b2001-08-09 22:21:55 +00002351PyObject *
2352PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002353 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002354 const char *errors,
2355 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356{
Walter Dörwald69652032004-09-07 20:24:22 +00002357 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2358}
2359
2360PyObject *
2361PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002362 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002363 const char *errors,
2364 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002365 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002366{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002367 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002368 Py_ssize_t startinpos;
2369 Py_ssize_t endinpos;
2370 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371 PyUnicodeObject *unicode;
2372 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002373 const unsigned char *q, *e;
2374 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002375 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002376 /* Offsets from q for retrieving byte pairs in the right order. */
2377#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2378 int ihi = 1, ilo = 0;
2379#else
2380 int ihi = 0, ilo = 1;
2381#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002382 PyObject *errorHandler = NULL;
2383 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384
2385 /* Note: size will always be longer than the resulting Unicode
2386 character count */
2387 unicode = _PyUnicode_New(size);
2388 if (!unicode)
2389 return NULL;
2390 if (size == 0)
2391 return (PyObject *)unicode;
2392
2393 /* Unpack UTF-16 encoded data */
2394 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002395 q = (unsigned char *)s;
2396 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397
2398 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002399 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002401 /* Check for BOM marks (U+FEFF) in the input and adjust current
2402 byte order setting accordingly. In native mode, the leading BOM
2403 mark is skipped, in all other modes, it is copied to the output
2404 stream as-is (giving a ZWNBSP character). */
2405 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002406 if (size >= 2) {
2407 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002408#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002409 if (bom == 0xFEFF) {
2410 q += 2;
2411 bo = -1;
2412 }
2413 else if (bom == 0xFFFE) {
2414 q += 2;
2415 bo = 1;
2416 }
Tim Petersced69f82003-09-16 20:30:58 +00002417#else
Walter Dörwald69652032004-09-07 20:24:22 +00002418 if (bom == 0xFEFF) {
2419 q += 2;
2420 bo = 1;
2421 }
2422 else if (bom == 0xFFFE) {
2423 q += 2;
2424 bo = -1;
2425 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002426#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002427 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429
Tim Peters772747b2001-08-09 22:21:55 +00002430 if (bo == -1) {
2431 /* force LE */
2432 ihi = 1;
2433 ilo = 0;
2434 }
2435 else if (bo == 1) {
2436 /* force BE */
2437 ihi = 0;
2438 ilo = 1;
2439 }
2440
2441 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002442 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002443 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002444 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002445 if (consumed)
2446 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002447 errmsg = "truncated data";
2448 startinpos = ((const char *)q)-starts;
2449 endinpos = ((const char *)e)-starts;
2450 goto utf16Error;
2451 /* The remaining input chars are ignored if the callback
2452 chooses to skip the input */
2453 }
2454 ch = (q[ihi] << 8) | q[ilo];
2455
Tim Peters772747b2001-08-09 22:21:55 +00002456 q += 2;
2457
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 if (ch < 0xD800 || ch > 0xDFFF) {
2459 *p++ = ch;
2460 continue;
2461 }
2462
2463 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002464 if (q >= e) {
2465 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002466 startinpos = (((const char *)q)-2)-starts;
2467 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002468 goto utf16Error;
2469 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002470 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002471 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2472 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002473 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002474#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002475 *p++ = ch;
2476 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002477#else
2478 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002479#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002480 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002481 }
2482 else {
2483 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002484 startinpos = (((const char *)q)-4)-starts;
2485 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002486 goto utf16Error;
2487 }
2488
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002490 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 startinpos = (((const char *)q)-2)-starts;
2492 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002493 /* Fall through to report the error */
2494
2495 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002496 outpos = p-PyUnicode_AS_UNICODE(unicode);
2497 if (unicode_decode_call_errorhandler(
2498 errors, &errorHandler,
2499 "utf16", errmsg,
2500 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00002501 &unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002502 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 }
2504
2505 if (byteorder)
2506 *byteorder = bo;
2507
Walter Dörwald69652032004-09-07 20:24:22 +00002508 if (consumed)
2509 *consumed = (const char *)q-starts;
2510
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002512 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 goto onError;
2514
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002515 Py_XDECREF(errorHandler);
2516 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 return (PyObject *)unicode;
2518
2519onError:
2520 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002521 Py_XDECREF(errorHandler);
2522 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 return NULL;
2524}
2525
Tim Peters772747b2001-08-09 22:21:55 +00002526PyObject *
2527PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002528 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002529 const char *errors,
2530 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531{
2532 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002533 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002534 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002535#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002536 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002537#else
2538 const int pairs = 0;
2539#endif
Tim Peters772747b2001-08-09 22:21:55 +00002540 /* Offsets from p for storing byte pairs in the right order. */
2541#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2542 int ihi = 1, ilo = 0;
2543#else
2544 int ihi = 0, ilo = 1;
2545#endif
2546
2547#define STORECHAR(CH) \
2548 do { \
2549 p[ihi] = ((CH) >> 8) & 0xff; \
2550 p[ilo] = (CH) & 0xff; \
2551 p += 2; \
2552 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002554#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002555 for (i = pairs = 0; i < size; i++)
2556 if (s[i] >= 0x10000)
2557 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002558#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002559 /* 2 * (size + pairs + (byteorder == 0)) */
2560 if (size > PY_SSIZE_T_MAX ||
2561 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2562 return PyErr_NoMemory();
2563 nsize = size + pairs + (byteorder == 0);
2564 bytesize = nsize * 2;
2565 if (bytesize / 2 != nsize)
2566 return PyErr_NoMemory();
2567 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 if (v == NULL)
2569 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002571 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002573 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002574 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002575 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002576
2577 if (byteorder == -1) {
2578 /* force LE */
2579 ihi = 1;
2580 ilo = 0;
2581 }
2582 else if (byteorder == 1) {
2583 /* force BE */
2584 ihi = 0;
2585 ilo = 1;
2586 }
2587
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002588 while (size-- > 0) {
2589 Py_UNICODE ch = *s++;
2590 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002591#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002592 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002593 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2594 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002596#endif
Tim Peters772747b2001-08-09 22:21:55 +00002597 STORECHAR(ch);
2598 if (ch2)
2599 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002602#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603}
2604
2605PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2606{
2607 if (!PyUnicode_Check(unicode)) {
2608 PyErr_BadArgument();
2609 return NULL;
2610 }
2611 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2612 PyUnicode_GET_SIZE(unicode),
2613 NULL,
2614 0);
2615}
2616
2617/* --- Unicode Escape Codec ----------------------------------------------- */
2618
Fredrik Lundh06d12682001-01-24 07:59:11 +00002619static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002620
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002622 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 const char *errors)
2624{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002625 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002626 Py_ssize_t startinpos;
2627 Py_ssize_t endinpos;
2628 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002629 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002631 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002633 char* message;
2634 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002635 PyObject *errorHandler = NULL;
2636 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002637
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 /* Escaped strings will always be longer than the resulting
2639 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 length after conversion to the true value.
2641 (but if the error callback returns a long replacement string
2642 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 v = _PyUnicode_New(size);
2644 if (v == NULL)
2645 goto onError;
2646 if (size == 0)
2647 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002651
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652 while (s < end) {
2653 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002654 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002655 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656
2657 /* Non-escape characters are interpreted as Unicode ordinals */
2658 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002659 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 continue;
2661 }
2662
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664 /* \ - Escapes */
2665 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002666 c = *s++;
2667 if (s > end)
2668 c = '\0'; /* Invalid after \ */
2669 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670
2671 /* \x escapes */
2672 case '\n': break;
2673 case '\\': *p++ = '\\'; break;
2674 case '\'': *p++ = '\''; break;
2675 case '\"': *p++ = '\"'; break;
2676 case 'b': *p++ = '\b'; break;
2677 case 'f': *p++ = '\014'; break; /* FF */
2678 case 't': *p++ = '\t'; break;
2679 case 'n': *p++ = '\n'; break;
2680 case 'r': *p++ = '\r'; break;
2681 case 'v': *p++ = '\013'; break; /* VT */
2682 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2683
2684 /* \OOO (octal) escapes */
2685 case '0': case '1': case '2': case '3':
2686 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002687 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002688 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002689 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002690 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002691 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002693 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 break;
2695
Fredrik Lundhccc74732001-02-18 22:13:49 +00002696 /* hex escapes */
2697 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002699 digits = 2;
2700 message = "truncated \\xXX escape";
2701 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702
Fredrik Lundhccc74732001-02-18 22:13:49 +00002703 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002705 digits = 4;
2706 message = "truncated \\uXXXX escape";
2707 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708
Fredrik Lundhccc74732001-02-18 22:13:49 +00002709 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002710 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002711 digits = 8;
2712 message = "truncated \\UXXXXXXXX escape";
2713 hexescape:
2714 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 outpos = p-PyUnicode_AS_UNICODE(v);
2716 if (s+digits>end) {
2717 endinpos = size;
2718 if (unicode_decode_call_errorhandler(
2719 errors, &errorHandler,
2720 "unicodeescape", "end of string in escape sequence",
2721 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00002722 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 goto onError;
2724 goto nextByte;
2725 }
2726 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002727 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002728 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 endinpos = (s+i+1)-starts;
2730 if (unicode_decode_call_errorhandler(
2731 errors, &errorHandler,
2732 "unicodeescape", message,
2733 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00002734 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002735 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002737 }
2738 chr = (chr<<4) & ~0xF;
2739 if (c >= '0' && c <= '9')
2740 chr += c - '0';
2741 else if (c >= 'a' && c <= 'f')
2742 chr += 10 + c - 'a';
2743 else
2744 chr += 10 + c - 'A';
2745 }
2746 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002747 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 /* _decoding_error will have already written into the
2749 target buffer. */
2750 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002751 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002752 /* when we get here, chr is a 32-bit unicode character */
2753 if (chr <= 0xffff)
2754 /* UCS-2 character */
2755 *p++ = (Py_UNICODE) chr;
2756 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002757 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002758 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002759#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002760 *p++ = chr;
2761#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002762 chr -= 0x10000L;
2763 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002764 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002765#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002766 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767 endinpos = s-starts;
2768 outpos = p-PyUnicode_AS_UNICODE(v);
2769 if (unicode_decode_call_errorhandler(
2770 errors, &errorHandler,
2771 "unicodeescape", "illegal Unicode character",
2772 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00002773 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002774 goto onError;
2775 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002776 break;
2777
2778 /* \N{name} */
2779 case 'N':
2780 message = "malformed \\N character escape";
2781 if (ucnhash_CAPI == NULL) {
2782 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002783 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002784 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002785 if (m == NULL)
2786 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002787 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002788 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002789 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002790 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002791 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002792 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002793 if (ucnhash_CAPI == NULL)
2794 goto ucnhashError;
2795 }
2796 if (*s == '{') {
2797 const char *start = s+1;
2798 /* look for the closing brace */
2799 while (*s != '}' && s < end)
2800 s++;
2801 if (s > start && s < end && *s == '}') {
2802 /* found a name. look it up in the unicode database */
2803 message = "unknown Unicode character name";
2804 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002805 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002806 goto store;
2807 }
2808 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 endinpos = s-starts;
2810 outpos = p-PyUnicode_AS_UNICODE(v);
2811 if (unicode_decode_call_errorhandler(
2812 errors, &errorHandler,
2813 "unicodeescape", message,
2814 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00002815 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002816 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002817 break;
2818
2819 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002820 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 message = "\\ at end of string";
2822 s--;
2823 endinpos = s-starts;
2824 outpos = p-PyUnicode_AS_UNICODE(v);
2825 if (unicode_decode_call_errorhandler(
2826 errors, &errorHandler,
2827 "unicodeescape", message,
2828 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00002829 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002830 goto onError;
2831 }
2832 else {
2833 *p++ = '\\';
2834 *p++ = (unsigned char)s[-1];
2835 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002836 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 nextByte:
2839 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002841 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002843 Py_XDECREF(errorHandler);
2844 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002846
Fredrik Lundhccc74732001-02-18 22:13:49 +00002847ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002848 PyErr_SetString(
2849 PyExc_UnicodeError,
2850 "\\N escapes not supported (can't load unicodedata module)"
2851 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002852 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002853 Py_XDECREF(errorHandler);
2854 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002855 return NULL;
2856
Fredrik Lundhccc74732001-02-18 22:13:49 +00002857onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 Py_XDECREF(errorHandler);
2860 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 return NULL;
2862}
2863
2864/* Return a Unicode-Escape string version of the Unicode object.
2865
2866 If quotes is true, the string is enclosed in u"" or u'' quotes as
2867 appropriate.
2868
2869*/
2870
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002871Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002872 Py_ssize_t size,
2873 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002874{
2875 /* like wcschr, but doesn't stop at NULL characters */
2876
2877 while (size-- > 0) {
2878 if (*s == ch)
2879 return s;
2880 s++;
2881 }
2882
2883 return NULL;
2884}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002885
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886static
2887PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002888 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 int quotes)
2890{
2891 PyObject *repr;
2892 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002894 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002895#ifdef Py_UNICODE_WIDE
2896 const Py_ssize_t expandsize = 10;
2897#else
2898 const Py_ssize_t expandsize = 6;
2899#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900
Neal Norwitz17753ec2006-08-21 22:21:19 +00002901 /* XXX(nnorwitz): rather than over-allocating, it would be
2902 better to choose a different scheme. Perhaps scan the
2903 first N-chars of the string and allocate based on that size.
2904 */
2905 /* Initial allocation is based on the longest-possible unichr
2906 escape.
2907
2908 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2909 unichr, so in this case it's the longest unichr escape. In
2910 narrow (UTF-16) builds this is five chars per source unichr
2911 since there are two unichrs in the surrogate pair, so in narrow
2912 (UTF-16) builds it's not the longest unichr escape.
2913
2914 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2915 so in the narrow (UTF-16) build case it's the longest unichr
2916 escape.
2917 */
2918
Neal Norwitze7d8be82008-07-31 17:17:14 +00002919 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2920 return PyErr_NoMemory();
2921
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002922 repr = PyString_FromStringAndSize(NULL,
Neal Norwitz17753ec2006-08-21 22:21:19 +00002923 2
Neal Norwitze7d8be82008-07-31 17:17:14 +00002924 + expandsize*size
Neal Norwitz17753ec2006-08-21 22:21:19 +00002925 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 if (repr == NULL)
2927 return NULL;
2928
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002929 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930
2931 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002933 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 !findchar(s, size, '"')) ? '"' : '\'';
2935 }
2936 while (size-- > 0) {
2937 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002938
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002939 /* Escape quotes and backslashes */
2940 if ((quotes &&
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002941 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942 *p++ = '\\';
2943 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002944 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002945 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002946
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002947#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002948 /* Map 21-bit characters to '\U00xxxxxx' */
2949 else if (ch >= 0x10000) {
2950 *p++ = '\\';
2951 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002952 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2953 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2954 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2955 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2956 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2957 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2958 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002959 *p++ = hexdigit[ch & 0x0000000F];
2960 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002961 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002962#else
2963 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002964 else if (ch >= 0xD800 && ch < 0xDC00) {
2965 Py_UNICODE ch2;
2966 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002967
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002968 ch2 = *s++;
2969 size--;
2970 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2971 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2972 *p++ = '\\';
2973 *p++ = 'U';
2974 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2975 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2976 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2977 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2978 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2979 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2980 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2981 *p++ = hexdigit[ucs & 0x0000000F];
2982 continue;
2983 }
2984 /* Fall through: isolated surrogates are copied as-is */
2985 s--;
2986 size++;
2987 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002988#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002989
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002991 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 *p++ = '\\';
2993 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002994 *p++ = hexdigit[(ch >> 12) & 0x000F];
2995 *p++ = hexdigit[(ch >> 8) & 0x000F];
2996 *p++ = hexdigit[(ch >> 4) & 0x000F];
2997 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002999
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003000 /* Map special whitespace to '\t', \n', '\r' */
3001 else if (ch == '\t') {
3002 *p++ = '\\';
3003 *p++ = 't';
3004 }
3005 else if (ch == '\n') {
3006 *p++ = '\\';
3007 *p++ = 'n';
3008 }
3009 else if (ch == '\r') {
3010 *p++ = '\\';
3011 *p++ = 'r';
3012 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003013
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003014 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003015 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003017 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003018 *p++ = hexdigit[(ch >> 4) & 0x000F];
3019 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003020 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003021
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 /* Copy everything else as-is */
3023 else
3024 *p++ = (char) ch;
3025 }
3026 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003027 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003030 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 return repr;
3032}
3033
3034PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003035 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036{
3037 return unicodeescape_string(s, size, 0);
3038}
3039
3040PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3041{
3042 if (!PyUnicode_Check(unicode)) {
3043 PyErr_BadArgument();
3044 return NULL;
3045 }
3046 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3047 PyUnicode_GET_SIZE(unicode));
3048}
3049
3050/* --- Raw Unicode Escape Codec ------------------------------------------- */
3051
3052PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003053 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 const char *errors)
3055{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003056 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003057 Py_ssize_t startinpos;
3058 Py_ssize_t endinpos;
3059 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 const char *end;
3063 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 PyObject *errorHandler = NULL;
3065 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003066
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 /* Escaped strings will always be longer than the resulting
3068 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003069 length after conversion to the true value. (But decoding error
3070 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 v = _PyUnicode_New(size);
3072 if (v == NULL)
3073 goto onError;
3074 if (size == 0)
3075 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 end = s + size;
3078 while (s < end) {
3079 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003080 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003082 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083
3084 /* Non-escape characters are interpreted as Unicode ordinals */
3085 if (*s != '\\') {
3086 *p++ = (unsigned char)*s++;
3087 continue;
3088 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090
3091 /* \u-escapes are only interpreted iff the number of leading
3092 backslashes if odd */
3093 bs = s;
3094 for (;s < end;) {
3095 if (*s != '\\')
3096 break;
3097 *p++ = (unsigned char)*s++;
3098 }
3099 if (((s - bs) & 1) == 0 ||
3100 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003101 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 continue;
3103 }
3104 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003105 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106 s++;
3107
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003108 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003110 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003113 endinpos = s-starts;
3114 if (unicode_decode_call_errorhandler(
3115 errors, &errorHandler,
3116 "rawunicodeescape", "truncated \\uXXXX",
3117 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003118 &v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 }
3122 x = (x<<4) & ~0xF;
3123 if (c >= '0' && c <= '9')
3124 x += c - '0';
3125 else if (c >= 'a' && c <= 'f')
3126 x += 10 + c - 'a';
3127 else
3128 x += 10 + c - 'A';
3129 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003130 if (x <= 0xffff)
3131 /* UCS-2 character */
3132 *p++ = (Py_UNICODE) x;
3133 else if (x <= 0x10ffff) {
3134 /* UCS-4 character. Either store directly, or as
3135 surrogate pair. */
3136#ifdef Py_UNICODE_WIDE
Amaury Forgeot d'Arcfac02fa2008-03-24 21:04:10 +00003137 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003138#else
3139 x -= 0x10000L;
3140 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3141 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3142#endif
3143 } else {
3144 endinpos = s-starts;
3145 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003146 if (unicode_decode_call_errorhandler(
3147 errors, &errorHandler,
3148 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3149 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003150 &v, &outpos, &p))
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003151 goto onError;
3152 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003153 nextByte:
3154 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003156 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003157 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 Py_XDECREF(errorHandler);
3159 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003161
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 onError:
3163 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 Py_XDECREF(errorHandler);
3165 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 return NULL;
3167}
3168
3169PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003170 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171{
3172 PyObject *repr;
3173 char *p;
3174 char *q;
3175
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003176 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003177#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003178 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003179#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003180 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003181#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00003182
3183 if (size > PY_SSIZE_T_MAX / expandsize)
3184 return PyErr_NoMemory();
3185
3186 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187 if (repr == NULL)
3188 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003189 if (size == 0)
3190 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003192 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 while (size-- > 0) {
3194 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003195#ifdef Py_UNICODE_WIDE
3196 /* Map 32-bit characters to '\Uxxxxxxxx' */
3197 if (ch >= 0x10000) {
3198 *p++ = '\\';
3199 *p++ = 'U';
3200 *p++ = hexdigit[(ch >> 28) & 0xf];
3201 *p++ = hexdigit[(ch >> 24) & 0xf];
3202 *p++ = hexdigit[(ch >> 20) & 0xf];
3203 *p++ = hexdigit[(ch >> 16) & 0xf];
3204 *p++ = hexdigit[(ch >> 12) & 0xf];
3205 *p++ = hexdigit[(ch >> 8) & 0xf];
3206 *p++ = hexdigit[(ch >> 4) & 0xf];
3207 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003208 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003209 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003210#else
3211 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3212 if (ch >= 0xD800 && ch < 0xDC00) {
3213 Py_UNICODE ch2;
3214 Py_UCS4 ucs;
3215
3216 ch2 = *s++;
3217 size--;
3218 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3219 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3220 *p++ = '\\';
3221 *p++ = 'U';
3222 *p++ = hexdigit[(ucs >> 28) & 0xf];
3223 *p++ = hexdigit[(ucs >> 24) & 0xf];
3224 *p++ = hexdigit[(ucs >> 20) & 0xf];
3225 *p++ = hexdigit[(ucs >> 16) & 0xf];
3226 *p++ = hexdigit[(ucs >> 12) & 0xf];
3227 *p++ = hexdigit[(ucs >> 8) & 0xf];
3228 *p++ = hexdigit[(ucs >> 4) & 0xf];
3229 *p++ = hexdigit[ucs & 0xf];
3230 continue;
3231 }
3232 /* Fall through: isolated surrogates are copied as-is */
3233 s--;
3234 size++;
3235 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003236#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 /* Map 16-bit characters to '\uxxxx' */
3238 if (ch >= 256) {
3239 *p++ = '\\';
3240 *p++ = 'u';
3241 *p++ = hexdigit[(ch >> 12) & 0xf];
3242 *p++ = hexdigit[(ch >> 8) & 0xf];
3243 *p++ = hexdigit[(ch >> 4) & 0xf];
3244 *p++ = hexdigit[ch & 15];
3245 }
3246 /* Copy everything else as-is */
3247 else
3248 *p++ = (char) ch;
3249 }
3250 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003251 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252 return repr;
3253}
3254
3255PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3256{
3257 if (!PyUnicode_Check(unicode)) {
3258 PyErr_BadArgument();
3259 return NULL;
3260 }
3261 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3262 PyUnicode_GET_SIZE(unicode));
3263}
3264
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003265/* --- Unicode Internal Codec ------------------------------------------- */
3266
3267PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003268 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003269 const char *errors)
3270{
3271 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003272 Py_ssize_t startinpos;
3273 Py_ssize_t endinpos;
3274 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003275 PyUnicodeObject *v;
3276 Py_UNICODE *p;
3277 const char *end;
3278 const char *reason;
3279 PyObject *errorHandler = NULL;
3280 PyObject *exc = NULL;
3281
Neal Norwitzd43069c2006-01-08 01:12:10 +00003282#ifdef Py_UNICODE_WIDE
3283 Py_UNICODE unimax = PyUnicode_GetMax();
3284#endif
3285
Armin Rigo7ccbca92006-10-04 12:17:45 +00003286 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003287 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3288 if (v == NULL)
3289 goto onError;
3290 if (PyUnicode_GetSize((PyObject *)v) == 0)
3291 return (PyObject *)v;
3292 p = PyUnicode_AS_UNICODE(v);
3293 end = s + size;
3294
3295 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003296 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003297 /* We have to sanity check the raw data, otherwise doom looms for
3298 some malformed UCS-4 data. */
3299 if (
3300 #ifdef Py_UNICODE_WIDE
3301 *p > unimax || *p < 0 ||
3302 #endif
3303 end-s < Py_UNICODE_SIZE
3304 )
3305 {
3306 startinpos = s - starts;
3307 if (end-s < Py_UNICODE_SIZE) {
3308 endinpos = end-starts;
3309 reason = "truncated input";
3310 }
3311 else {
3312 endinpos = s - starts + Py_UNICODE_SIZE;
3313 reason = "illegal code point (> 0x10FFFF)";
3314 }
3315 outpos = p - PyUnicode_AS_UNICODE(v);
3316 if (unicode_decode_call_errorhandler(
3317 errors, &errorHandler,
3318 "unicode_internal", reason,
3319 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003320 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003321 goto onError;
3322 }
3323 }
3324 else {
3325 p++;
3326 s += Py_UNICODE_SIZE;
3327 }
3328 }
3329
Martin v. Löwis412fb672006-04-13 06:34:32 +00003330 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003331 goto onError;
3332 Py_XDECREF(errorHandler);
3333 Py_XDECREF(exc);
3334 return (PyObject *)v;
3335
3336 onError:
3337 Py_XDECREF(v);
3338 Py_XDECREF(errorHandler);
3339 Py_XDECREF(exc);
3340 return NULL;
3341}
3342
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343/* --- Latin-1 Codec ------------------------------------------------------ */
3344
3345PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003346 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 const char *errors)
3348{
3349 PyUnicodeObject *v;
3350 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003351
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003353 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003354 Py_UNICODE r = *(unsigned char*)s;
3355 return PyUnicode_FromUnicode(&r, 1);
3356 }
3357
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 v = _PyUnicode_New(size);
3359 if (v == NULL)
3360 goto onError;
3361 if (size == 0)
3362 return (PyObject *)v;
3363 p = PyUnicode_AS_UNICODE(v);
3364 while (size-- > 0)
3365 *p++ = (unsigned char)*s++;
3366 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003367
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 onError:
3369 Py_XDECREF(v);
3370 return NULL;
3371}
3372
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003373/* create or adjust a UnicodeEncodeError */
3374static void make_encode_exception(PyObject **exceptionObject,
3375 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003376 const Py_UNICODE *unicode, Py_ssize_t size,
3377 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003380 if (*exceptionObject == NULL) {
3381 *exceptionObject = PyUnicodeEncodeError_Create(
3382 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383 }
3384 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3386 goto onError;
3387 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3388 goto onError;
3389 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3390 goto onError;
3391 return;
3392 onError:
3393 Py_DECREF(*exceptionObject);
3394 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395 }
3396}
3397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398/* raises a UnicodeEncodeError */
3399static void raise_encode_exception(PyObject **exceptionObject,
3400 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003401 const Py_UNICODE *unicode, Py_ssize_t size,
3402 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 const char *reason)
3404{
3405 make_encode_exception(exceptionObject,
3406 encoding, unicode, size, startpos, endpos, reason);
3407 if (*exceptionObject != NULL)
3408 PyCodec_StrictErrors(*exceptionObject);
3409}
3410
3411/* error handling callback helper:
3412 build arguments, call the callback and check the arguments,
3413 put the result into newpos and return the replacement string, which
3414 has to be freed by the caller */
3415static PyObject *unicode_encode_call_errorhandler(const char *errors,
3416 PyObject **errorHandler,
3417 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003418 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3419 Py_ssize_t startpos, Py_ssize_t endpos,
3420 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003422 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423
3424 PyObject *restuple;
3425 PyObject *resunicode;
3426
3427 if (*errorHandler == NULL) {
3428 *errorHandler = PyCodec_LookupError(errors);
3429 if (*errorHandler == NULL)
3430 return NULL;
3431 }
3432
3433 make_encode_exception(exceptionObject,
3434 encoding, unicode, size, startpos, endpos, reason);
3435 if (*exceptionObject == NULL)
3436 return NULL;
3437
3438 restuple = PyObject_CallFunctionObjArgs(
3439 *errorHandler, *exceptionObject, NULL);
3440 if (restuple == NULL)
3441 return NULL;
3442 if (!PyTuple_Check(restuple)) {
3443 PyErr_Format(PyExc_TypeError, &argparse[4]);
3444 Py_DECREF(restuple);
3445 return NULL;
3446 }
3447 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3448 &resunicode, newpos)) {
3449 Py_DECREF(restuple);
3450 return NULL;
3451 }
3452 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003453 *newpos = size+*newpos;
3454 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003455 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003456 Py_DECREF(restuple);
3457 return NULL;
3458 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459 Py_INCREF(resunicode);
3460 Py_DECREF(restuple);
3461 return resunicode;
3462}
3463
3464static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003465 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 const char *errors,
3467 int limit)
3468{
3469 /* output object */
3470 PyObject *res;
3471 /* pointers to the beginning and end+1 of input */
3472 const Py_UNICODE *startp = p;
3473 const Py_UNICODE *endp = p + size;
3474 /* pointer to the beginning of the unencodable characters */
3475 /* const Py_UNICODE *badp = NULL; */
3476 /* pointer into the output */
3477 char *str;
3478 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003479 Py_ssize_t respos = 0;
3480 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003481 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3482 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 PyObject *errorHandler = NULL;
3484 PyObject *exc = NULL;
3485 /* the following variable is used for caching string comparisons
3486 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3487 int known_errorHandler = -1;
3488
3489 /* allocate enough for a simple encoding without
3490 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003491 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492 if (res == NULL)
3493 goto onError;
3494 if (size == 0)
3495 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003496 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 ressize = size;
3498
3499 while (p<endp) {
3500 Py_UNICODE c = *p;
3501
3502 /* can we encode this? */
3503 if (c<limit) {
3504 /* no overflow check, because we know that the space is enough */
3505 *str++ = (char)c;
3506 ++p;
3507 }
3508 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003509 Py_ssize_t unicodepos = p-startp;
3510 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003512 Py_ssize_t repsize;
3513 Py_ssize_t newpos;
3514 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515 Py_UNICODE *uni2;
3516 /* startpos for collecting unencodable chars */
3517 const Py_UNICODE *collstart = p;
3518 const Py_UNICODE *collend = p;
3519 /* find all unecodable characters */
3520 while ((collend < endp) && ((*collend)>=limit))
3521 ++collend;
3522 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3523 if (known_errorHandler==-1) {
3524 if ((errors==NULL) || (!strcmp(errors, "strict")))
3525 known_errorHandler = 1;
3526 else if (!strcmp(errors, "replace"))
3527 known_errorHandler = 2;
3528 else if (!strcmp(errors, "ignore"))
3529 known_errorHandler = 3;
3530 else if (!strcmp(errors, "xmlcharrefreplace"))
3531 known_errorHandler = 4;
3532 else
3533 known_errorHandler = 0;
3534 }
3535 switch (known_errorHandler) {
3536 case 1: /* strict */
3537 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3538 goto onError;
3539 case 2: /* replace */
3540 while (collstart++<collend)
3541 *str++ = '?'; /* fall through */
3542 case 3: /* ignore */
3543 p = collend;
3544 break;
3545 case 4: /* xmlcharrefreplace */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003546 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 /* determine replacement size (temporarily (mis)uses p) */
3548 for (p = collstart, repsize = 0; p < collend; ++p) {
3549 if (*p<10)
3550 repsize += 2+1+1;
3551 else if (*p<100)
3552 repsize += 2+2+1;
3553 else if (*p<1000)
3554 repsize += 2+3+1;
3555 else if (*p<10000)
3556 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003557#ifndef Py_UNICODE_WIDE
3558 else
3559 repsize += 2+5+1;
3560#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 else if (*p<100000)
3562 repsize += 2+5+1;
3563 else if (*p<1000000)
3564 repsize += 2+6+1;
3565 else
3566 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003567#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 }
3569 requiredsize = respos+repsize+(endp-collend);
3570 if (requiredsize > ressize) {
3571 if (requiredsize<2*ressize)
3572 requiredsize = 2*ressize;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003573 if (_PyString_Resize(&res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003575 str = PyString_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 ressize = requiredsize;
3577 }
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p = collstart; p < collend; ++p) {
3580 str += sprintf(str, "&#%d;", (int)*p);
3581 }
3582 p = collend;
3583 break;
3584 default:
3585 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3586 encoding, reason, startp, size, &exc,
3587 collstart-startp, collend-startp, &newpos);
3588 if (repunicode == NULL)
3589 goto onError;
3590 /* need more space? (at least enough for what we
3591 have+the replacement+the rest of the string, so
3592 we won't have to check space for encodable characters) */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003593 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 repsize = PyUnicode_GET_SIZE(repunicode);
3595 requiredsize = respos+repsize+(endp-collend);
3596 if (requiredsize > ressize) {
3597 if (requiredsize<2*ressize)
3598 requiredsize = 2*ressize;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003599 if (_PyString_Resize(&res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 Py_DECREF(repunicode);
3601 goto onError;
3602 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003603 str = PyString_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 ressize = requiredsize;
3605 }
3606 /* check if there is anything unencodable in the replacement
3607 and copy it to the output */
3608 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3609 c = *uni2;
3610 if (c >= limit) {
3611 raise_encode_exception(&exc, encoding, startp, size,
3612 unicodepos, unicodepos+1, reason);
3613 Py_DECREF(repunicode);
3614 goto onError;
3615 }
3616 *str = (char)c;
3617 }
3618 p = startp + newpos;
3619 Py_DECREF(repunicode);
3620 }
3621 }
3622 }
3623 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003624 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 if (respos<ressize)
3626 /* If this falls res will be NULL */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003627 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 Py_XDECREF(errorHandler);
3629 Py_XDECREF(exc);
3630 return res;
3631
3632 onError:
3633 Py_XDECREF(res);
3634 Py_XDECREF(errorHandler);
3635 Py_XDECREF(exc);
3636 return NULL;
3637}
3638
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003640 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 const char *errors)
3642{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644}
3645
3646PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3647{
3648 if (!PyUnicode_Check(unicode)) {
3649 PyErr_BadArgument();
3650 return NULL;
3651 }
3652 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3653 PyUnicode_GET_SIZE(unicode),
3654 NULL);
3655}
3656
3657/* --- 7-bit ASCII Codec -------------------------------------------------- */
3658
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003660 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 const char *errors)
3662{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 PyUnicodeObject *v;
3665 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003666 Py_ssize_t startinpos;
3667 Py_ssize_t endinpos;
3668 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 const char *e;
3670 PyObject *errorHandler = NULL;
3671 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003672
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003674 if (size == 1 && *(unsigned char*)s < 128) {
3675 Py_UNICODE r = *(unsigned char*)s;
3676 return PyUnicode_FromUnicode(&r, 1);
3677 }
Tim Petersced69f82003-09-16 20:30:58 +00003678
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 v = _PyUnicode_New(size);
3680 if (v == NULL)
3681 goto onError;
3682 if (size == 0)
3683 return (PyObject *)v;
3684 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 e = s + size;
3686 while (s < e) {
3687 register unsigned char c = (unsigned char)*s;
3688 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 ++s;
3691 }
3692 else {
3693 startinpos = s-starts;
3694 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003695 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 if (unicode_decode_call_errorhandler(
3697 errors, &errorHandler,
3698 "ascii", "ordinal not in range(128)",
3699 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003700 &v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003704 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003705 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003706 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 Py_XDECREF(errorHandler);
3708 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003710
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 onError:
3712 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 Py_XDECREF(errorHandler);
3714 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 return NULL;
3716}
3717
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003719 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 const char *errors)
3721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723}
3724
3725PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3726{
3727 if (!PyUnicode_Check(unicode)) {
3728 PyErr_BadArgument();
3729 return NULL;
3730 }
3731 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3732 PyUnicode_GET_SIZE(unicode),
3733 NULL);
3734}
3735
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003736#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003737
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003738/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003739
Martin v. Löwisd8251432006-06-14 05:21:04 +00003740#if SIZEOF_INT < SIZEOF_SSIZE_T
3741#define NEED_RETRY
3742#endif
3743
3744/* XXX This code is limited to "true" double-byte encodings, as
3745 a) it assumes an incomplete character consists of a single byte, and
3746 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3747 encodings, see IsDBCSLeadByteEx documentation. */
3748
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts
   when it is not itself the trail byte of the preceding character, which
   is decided from what CharPrev() reports as the previous character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3759
3760/*
3761 * Decode MBCS string into unicode object. If 'final' is set, converts
3762 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3763 */
3764static int decode_mbcs(PyUnicodeObject **v,
3765 const char *s, /* MBCS string */
3766 int size, /* sizeof MBCS string */
3767 int final)
3768{
3769 Py_UNICODE *p;
3770 Py_ssize_t n = 0;
3771 int usize = 0;
3772
3773 assert(size >= 0);
3774
3775 /* Skip trailing lead-byte unless 'final' is set */
3776 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3777 --size;
3778
3779 /* First get the size of the result */
3780 if (size > 0) {
3781 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3782 if (usize == 0) {
3783 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3784 return -1;
3785 }
3786 }
3787
3788 if (*v == NULL) {
3789 /* Create unicode object */
3790 *v = _PyUnicode_New(usize);
3791 if (*v == NULL)
3792 return -1;
3793 }
3794 else {
3795 /* Extend unicode object */
3796 n = PyUnicode_GET_SIZE(*v);
3797 if (_PyUnicode_Resize(v, n + usize) < 0)
3798 return -1;
3799 }
3800
3801 /* Do the conversion */
3802 if (size > 0) {
3803 p = PyUnicode_AS_UNICODE(*v) + n;
3804 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3805 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3806 return -1;
3807 }
3808 }
3809
3810 return size;
3811}
3812
3813PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3814 Py_ssize_t size,
3815 const char *errors,
3816 Py_ssize_t *consumed)
3817{
3818 PyUnicodeObject *v = NULL;
3819 int done;
3820
3821 if (consumed)
3822 *consumed = 0;
3823
3824#ifdef NEED_RETRY
3825 retry:
3826 if (size > INT_MAX)
3827 done = decode_mbcs(&v, s, INT_MAX, 0);
3828 else
3829#endif
3830 done = decode_mbcs(&v, s, (int)size, !consumed);
3831
3832 if (done < 0) {
3833 Py_XDECREF(v);
3834 return NULL;
3835 }
3836
3837 if (consumed)
3838 *consumed += done;
3839
3840#ifdef NEED_RETRY
3841 if (size > INT_MAX) {
3842 s += done;
3843 size -= done;
3844 goto retry;
3845 }
3846#endif
3847
3848 return (PyObject *)v;
3849}
3850
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003851PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003852 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003853 const char *errors)
3854{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003855 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3856}
3857
3858/*
3859 * Convert unicode into string object (MBCS).
3860 * Returns 0 if succeed, -1 otherwise.
3861 */
3862static int encode_mbcs(PyObject **repr,
3863 const Py_UNICODE *p, /* unicode */
3864 int size) /* size of unicode */
3865{
3866 int mbcssize = 0;
3867 Py_ssize_t n = 0;
3868
3869 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003870
3871 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003872 if (size > 0) {
3873 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3874 if (mbcssize == 0) {
3875 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3876 return -1;
3877 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003878 }
3879
Martin v. Löwisd8251432006-06-14 05:21:04 +00003880 if (*repr == NULL) {
3881 /* Create string object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003882 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003883 if (*repr == NULL)
3884 return -1;
3885 }
3886 else {
3887 /* Extend string object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003888 n = PyString_Size(*repr);
3889 if (_PyString_Resize(repr, n + mbcssize) < 0)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003890 return -1;
3891 }
3892
3893 /* Do the conversion */
3894 if (size > 0) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003895 char *s = PyString_AS_STRING(*repr) + n;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003896 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3897 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3898 return -1;
3899 }
3900 }
3901
3902 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003903}
3904
3905PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003906 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003907 const char *errors)
3908{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003909 PyObject *repr = NULL;
3910 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003911
Martin v. Löwisd8251432006-06-14 05:21:04 +00003912#ifdef NEED_RETRY
3913 retry:
3914 if (size > INT_MAX)
3915 ret = encode_mbcs(&repr, p, INT_MAX);
3916 else
3917#endif
3918 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003919
Martin v. Löwisd8251432006-06-14 05:21:04 +00003920 if (ret < 0) {
3921 Py_XDECREF(repr);
3922 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003923 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003924
3925#ifdef NEED_RETRY
3926 if (size > INT_MAX) {
3927 p += INT_MAX;
3928 size -= INT_MAX;
3929 goto retry;
3930 }
3931#endif
3932
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003933 return repr;
3934}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003935
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003936PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3937{
3938 if (!PyUnicode_Check(unicode)) {
3939 PyErr_BadArgument();
3940 return NULL;
3941 }
3942 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3943 PyUnicode_GET_SIZE(unicode),
3944 NULL);
3945}
3946
Martin v. Löwisd8251432006-06-14 05:21:04 +00003947#undef NEED_RETRY
3948
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003949#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003950
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951/* --- Character Mapping Codec -------------------------------------------- */
3952
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 PyObject *mapping,
3956 const char *errors)
3957{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003959 Py_ssize_t startinpos;
3960 Py_ssize_t endinpos;
3961 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 PyUnicodeObject *v;
3964 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003965 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 PyObject *errorHandler = NULL;
3967 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003968 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003969 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003970
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 /* Default to Latin-1 */
3972 if (mapping == NULL)
3973 return PyUnicode_DecodeLatin1(s, size, errors);
3974
3975 v = _PyUnicode_New(size);
3976 if (v == NULL)
3977 goto onError;
3978 if (size == 0)
3979 return (PyObject *)v;
3980 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003982 if (PyUnicode_CheckExact(mapping)) {
3983 mapstring = PyUnicode_AS_UNICODE(mapping);
3984 maplen = PyUnicode_GET_SIZE(mapping);
3985 while (s < e) {
3986 unsigned char ch = *s;
3987 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003989 if (ch < maplen)
3990 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003992 if (x == 0xfffe) {
3993 /* undefined mapping */
3994 outpos = p-PyUnicode_AS_UNICODE(v);
3995 startinpos = s-starts;
3996 endinpos = startinpos+1;
3997 if (unicode_decode_call_errorhandler(
3998 errors, &errorHandler,
3999 "charmap", "character maps to <undefined>",
4000 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00004001 &v, &outpos, &p)) {
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004002 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004003 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004004 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004005 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004006 *p++ = x;
4007 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004009 }
4010 else {
4011 while (s < e) {
4012 unsigned char ch = *s;
4013 PyObject *w, *x;
4014
4015 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4016 w = PyInt_FromLong((long)ch);
4017 if (w == NULL)
4018 goto onError;
4019 x = PyObject_GetItem(mapping, w);
4020 Py_DECREF(w);
4021 if (x == NULL) {
4022 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4023 /* No mapping found means: mapping is undefined. */
4024 PyErr_Clear();
4025 x = Py_None;
4026 Py_INCREF(x);
4027 } else
4028 goto onError;
4029 }
4030
4031 /* Apply mapping */
4032 if (PyInt_Check(x)) {
4033 long value = PyInt_AS_LONG(x);
4034 if (value < 0 || value > 65535) {
4035 PyErr_SetString(PyExc_TypeError,
4036 "character mapping must be in range(65536)");
4037 Py_DECREF(x);
4038 goto onError;
4039 }
4040 *p++ = (Py_UNICODE)value;
4041 }
4042 else if (x == Py_None) {
4043 /* undefined mapping */
4044 outpos = p-PyUnicode_AS_UNICODE(v);
4045 startinpos = s-starts;
4046 endinpos = startinpos+1;
4047 if (unicode_decode_call_errorhandler(
4048 errors, &errorHandler,
4049 "charmap", "character maps to <undefined>",
4050 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00004051 &v, &outpos, &p)) {
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004052 Py_DECREF(x);
4053 goto onError;
4054 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004055 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004056 continue;
4057 }
4058 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004060
4061 if (targetsize == 1)
4062 /* 1-1 mapping */
4063 *p++ = *PyUnicode_AS_UNICODE(x);
4064
4065 else if (targetsize > 1) {
4066 /* 1-n mapping */
4067 if (targetsize > extrachars) {
4068 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004069 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4070 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004071 (targetsize << 2);
4072 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00004073 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004074 if (_PyUnicode_Resize(&v,
4075 PyUnicode_GET_SIZE(v) + needed) < 0) {
4076 Py_DECREF(x);
4077 goto onError;
4078 }
4079 p = PyUnicode_AS_UNICODE(v) + oldpos;
4080 }
4081 Py_UNICODE_COPY(p,
4082 PyUnicode_AS_UNICODE(x),
4083 targetsize);
4084 p += targetsize;
4085 extrachars -= targetsize;
4086 }
4087 /* 1-0 mapping: skip the character */
4088 }
4089 else {
4090 /* wrong return value */
4091 PyErr_SetString(PyExc_TypeError,
4092 "character mapping must return integer, None or unicode");
4093 Py_DECREF(x);
4094 goto onError;
4095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004097 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 }
4100 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004101 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004103 Py_XDECREF(errorHandler);
4104 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004106
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 Py_XDECREF(errorHandler);
4109 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110 Py_XDECREF(v);
4111 return NULL;
4112}
4113
Martin v. Löwis3f767792006-06-04 19:36:28 +00004114/* Charmap encoding: the lookup table */
4115
4116struct encoding_map{
4117 PyObject_HEAD
4118 unsigned char level1[32];
4119 int count2, count3;
4120 unsigned char level23[1];
4121};
4122
4123static PyObject*
4124encoding_map_size(PyObject *obj, PyObject* args)
4125{
4126 struct encoding_map *map = (struct encoding_map*)obj;
4127 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4128 128*map->count3);
4129}
4130
4131static PyMethodDef encoding_map_methods[] = {
4132 {"size", encoding_map_size, METH_NOARGS,
4133 PyDoc_STR("Return the size (in bytes) of this object") },
4134 { 0 }
4135};
4136
4137static void
4138encoding_map_dealloc(PyObject* o)
4139{
4140 PyObject_FREE(o);
4141}
4142
4143static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004144 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004145 "EncodingMap", /*tp_name*/
4146 sizeof(struct encoding_map), /*tp_basicsize*/
4147 0, /*tp_itemsize*/
4148 /* methods */
4149 encoding_map_dealloc, /*tp_dealloc*/
4150 0, /*tp_print*/
4151 0, /*tp_getattr*/
4152 0, /*tp_setattr*/
4153 0, /*tp_compare*/
4154 0, /*tp_repr*/
4155 0, /*tp_as_number*/
4156 0, /*tp_as_sequence*/
4157 0, /*tp_as_mapping*/
4158 0, /*tp_hash*/
4159 0, /*tp_call*/
4160 0, /*tp_str*/
4161 0, /*tp_getattro*/
4162 0, /*tp_setattro*/
4163 0, /*tp_as_buffer*/
4164 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4165 0, /*tp_doc*/
4166 0, /*tp_traverse*/
4167 0, /*tp_clear*/
4168 0, /*tp_richcompare*/
4169 0, /*tp_weaklistoffset*/
4170 0, /*tp_iter*/
4171 0, /*tp_iternext*/
4172 encoding_map_methods, /*tp_methods*/
4173 0, /*tp_members*/
4174 0, /*tp_getset*/
4175 0, /*tp_base*/
4176 0, /*tp_dict*/
4177 0, /*tp_descr_get*/
4178 0, /*tp_descr_set*/
4179 0, /*tp_dictoffset*/
4180 0, /*tp_init*/
4181 0, /*tp_alloc*/
4182 0, /*tp_new*/
4183 0, /*tp_free*/
4184 0, /*tp_is_gc*/
4185};
4186
4187PyObject*
4188PyUnicode_BuildEncodingMap(PyObject* string)
4189{
4190 Py_UNICODE *decode;
4191 PyObject *result;
4192 struct encoding_map *mresult;
4193 int i;
4194 int need_dict = 0;
4195 unsigned char level1[32];
4196 unsigned char level2[512];
4197 unsigned char *mlevel1, *mlevel2, *mlevel3;
4198 int count2 = 0, count3 = 0;
4199
4200 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4201 PyErr_BadArgument();
4202 return NULL;
4203 }
4204 decode = PyUnicode_AS_UNICODE(string);
4205 memset(level1, 0xFF, sizeof level1);
4206 memset(level2, 0xFF, sizeof level2);
4207
4208 /* If there isn't a one-to-one mapping of NULL to \0,
4209 or if there are non-BMP characters, we need to use
4210 a mapping dictionary. */
4211 if (decode[0] != 0)
4212 need_dict = 1;
4213 for (i = 1; i < 256; i++) {
4214 int l1, l2;
4215 if (decode[i] == 0
4216 #ifdef Py_UNICODE_WIDE
4217 || decode[i] > 0xFFFF
4218 #endif
4219 ) {
4220 need_dict = 1;
4221 break;
4222 }
4223 if (decode[i] == 0xFFFE)
4224 /* unmapped character */
4225 continue;
4226 l1 = decode[i] >> 11;
4227 l2 = decode[i] >> 7;
4228 if (level1[l1] == 0xFF)
4229 level1[l1] = count2++;
4230 if (level2[l2] == 0xFF)
4231 level2[l2] = count3++;
4232 }
4233
4234 if (count2 >= 0xFF || count3 >= 0xFF)
4235 need_dict = 1;
4236
4237 if (need_dict) {
4238 PyObject *result = PyDict_New();
4239 PyObject *key, *value;
4240 if (!result)
4241 return NULL;
4242 for (i = 0; i < 256; i++) {
4243 key = value = NULL;
4244 key = PyInt_FromLong(decode[i]);
4245 value = PyInt_FromLong(i);
4246 if (!key || !value)
4247 goto failed1;
4248 if (PyDict_SetItem(result, key, value) == -1)
4249 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004250 Py_DECREF(key);
4251 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004252 }
4253 return result;
4254 failed1:
4255 Py_XDECREF(key);
4256 Py_XDECREF(value);
4257 Py_DECREF(result);
4258 return NULL;
4259 }
4260
4261 /* Create a three-level trie */
4262 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4263 16*count2 + 128*count3 - 1);
4264 if (!result)
4265 return PyErr_NoMemory();
4266 PyObject_Init(result, &EncodingMapType);
4267 mresult = (struct encoding_map*)result;
4268 mresult->count2 = count2;
4269 mresult->count3 = count3;
4270 mlevel1 = mresult->level1;
4271 mlevel2 = mresult->level23;
4272 mlevel3 = mresult->level23 + 16*count2;
4273 memcpy(mlevel1, level1, 32);
4274 memset(mlevel2, 0xFF, 16*count2);
4275 memset(mlevel3, 0, 128*count3);
4276 count3 = 0;
4277 for (i = 1; i < 256; i++) {
4278 int o1, o2, o3, i2, i3;
4279 if (decode[i] == 0xFFFE)
4280 /* unmapped character */
4281 continue;
4282 o1 = decode[i]>>11;
4283 o2 = (decode[i]>>7) & 0xF;
4284 i2 = 16*mlevel1[o1] + o2;
4285 if (mlevel2[i2] == 0xFF)
4286 mlevel2[i2] = count3++;
4287 o3 = decode[i] & 0x7F;
4288 i3 = 128*mlevel2[i2] + o3;
4289 mlevel3[i3] = i;
4290 }
4291 return result;
4292}
4293
4294static int
4295encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4296{
4297 struct encoding_map *map = (struct encoding_map*)mapping;
4298 int l1 = c>>11;
4299 int l2 = (c>>7) & 0xF;
4300 int l3 = c & 0x7F;
4301 int i;
4302
4303#ifdef Py_UNICODE_WIDE
4304 if (c > 0xFFFF) {
4305 return -1;
4306 }
4307#endif
4308 if (c == 0)
4309 return 0;
4310 /* level 1*/
4311 i = map->level1[l1];
4312 if (i == 0xFF) {
4313 return -1;
4314 }
4315 /* level 2*/
4316 i = map->level23[16*i+l2];
4317 if (i == 0xFF) {
4318 return -1;
4319 }
4320 /* level 3 */
4321 i = map->level23[16*map->count2 + 128*i + l3];
4322 if (i == 0) {
4323 return -1;
4324 }
4325 return i;
4326}
4327
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328/* Lookup the character ch in the mapping. If the character
4329 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004330 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 PyObject *w = PyInt_FromLong((long)c);
4334 PyObject *x;
4335
4336 if (w == NULL)
4337 return NULL;
4338 x = PyObject_GetItem(mapping, w);
4339 Py_DECREF(w);
4340 if (x == NULL) {
4341 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4342 /* No mapping found means: mapping is undefined. */
4343 PyErr_Clear();
4344 x = Py_None;
4345 Py_INCREF(x);
4346 return x;
4347 } else
4348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004350 else if (x == Py_None)
4351 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352 else if (PyInt_Check(x)) {
4353 long value = PyInt_AS_LONG(x);
4354 if (value < 0 || value > 255) {
4355 PyErr_SetString(PyExc_TypeError,
4356 "character mapping must be in range(256)");
4357 Py_DECREF(x);
4358 return NULL;
4359 }
4360 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004362 else if (PyString_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004365 /* wrong return value */
4366 PyErr_SetString(PyExc_TypeError,
4367 "character mapping must return integer, None or str");
4368 Py_DECREF(x);
4369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 }
4371}
4372
Martin v. Löwis3f767792006-06-04 19:36:28 +00004373static int
4374charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4375{
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004376 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004377 /* exponentially overallocate to minimize reallocations */
4378 if (requiredsize < 2*outsize)
4379 requiredsize = 2*outsize;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004380 if (_PyString_Resize(outobj, requiredsize)) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004381 return 0;
4382 }
4383 return 1;
4384}
4385
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined; nothing was written */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389/* lookup the character, put the result in the output string and adjust
4390 various state variables. Reallocate the output string if not enough
4391 space is available. Return a new reference to the object that
4392 was put in the output buffer, or Py_None, if the mapping was undefined
4393 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004394 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004396charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004397 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004399 PyObject *rep;
4400 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004401 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402
Christian Heimese93237d2007-12-19 02:37:44 +00004403 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004404 int res = encoding_map_lookup(c, mapping);
4405 Py_ssize_t requiredsize = *outpos+1;
4406 if (res == -1)
4407 return enc_FAILED;
4408 if (outsize<requiredsize)
4409 if (!charmapencode_resize(outobj, outpos, requiredsize))
4410 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004411 outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004412 outstart[(*outpos)++] = (char)res;
4413 return enc_SUCCESS;
4414 }
4415
4416 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004418 return enc_EXCEPTION;
4419 else if (rep==Py_None) {
4420 Py_DECREF(rep);
4421 return enc_FAILED;
4422 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004424 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004425 if (outsize<requiredsize)
4426 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004428 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004430 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4432 }
4433 else {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004434 const char *repchars = PyString_AS_STRING(rep);
4435 Py_ssize_t repsize = PyString_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004436 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004437 if (outsize<requiredsize)
4438 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004440 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004442 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 memcpy(outstart + *outpos, repchars, repsize);
4444 *outpos += repsize;
4445 }
4446 }
Georg Brandl9f167602006-06-04 21:46:16 +00004447 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004448 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449}
4450
4451/* handle an error in PyUnicode_EncodeCharmap
4452 Return 0 on success, -1 on error */
4453static
4454int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004455 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004457 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004458 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459{
4460 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004461 Py_ssize_t repsize;
4462 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463 Py_UNICODE *uni2;
4464 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004465 Py_ssize_t collstartpos = *inpos;
4466 Py_ssize_t collendpos = *inpos+1;
4467 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 char *encoding = "charmap";
4469 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004470 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 /* find all unencodable characters */
4473 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004474 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004475 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004476 int res = encoding_map_lookup(p[collendpos], mapping);
4477 if (res != -1)
4478 break;
4479 ++collendpos;
4480 continue;
4481 }
4482
4483 rep = charmapencode_lookup(p[collendpos], mapping);
4484 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004486 else if (rep!=Py_None) {
4487 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 break;
4489 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004490 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 ++collendpos;
4492 }
4493 /* cache callback name lookup
4494 * (if not done yet, i.e. it's the first error) */
4495 if (*known_errorHandler==-1) {
4496 if ((errors==NULL) || (!strcmp(errors, "strict")))
4497 *known_errorHandler = 1;
4498 else if (!strcmp(errors, "replace"))
4499 *known_errorHandler = 2;
4500 else if (!strcmp(errors, "ignore"))
4501 *known_errorHandler = 3;
4502 else if (!strcmp(errors, "xmlcharrefreplace"))
4503 *known_errorHandler = 4;
4504 else
4505 *known_errorHandler = 0;
4506 }
4507 switch (*known_errorHandler) {
4508 case 1: /* strict */
4509 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4510 return -1;
4511 case 2: /* replace */
4512 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4513 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004514 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 return -1;
4516 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004517 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4519 return -1;
4520 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 }
4522 /* fall through */
4523 case 3: /* ignore */
4524 *inpos = collendpos;
4525 break;
4526 case 4: /* xmlcharrefreplace */
4527 /* generate replacement (temporarily (mis)uses p) */
4528 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4529 char buffer[2+29+1+1];
4530 char *cp;
4531 sprintf(buffer, "&#%d;", (int)p[collpos]);
4532 for (cp = buffer; *cp; ++cp) {
4533 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004534 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004536 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4538 return -1;
4539 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 }
4541 }
4542 *inpos = collendpos;
4543 break;
4544 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004545 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 encoding, reason, p, size, exceptionObject,
4547 collstartpos, collendpos, &newpos);
4548 if (repunicode == NULL)
4549 return -1;
4550 /* generate replacement */
4551 repsize = PyUnicode_GET_SIZE(repunicode);
4552 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4553 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004554 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 return -1;
4556 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004557 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4560 return -1;
4561 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 }
4563 *inpos = newpos;
4564 Py_DECREF(repunicode);
4565 }
4566 return 0;
4567}
4568
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004570 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571 PyObject *mapping,
4572 const char *errors)
4573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 /* output object */
4575 PyObject *res = NULL;
4576 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004577 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004579 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 PyObject *errorHandler = NULL;
4581 PyObject *exc = NULL;
4582 /* the following variable is used for caching string comparisons
4583 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4584 * 3=ignore, 4=xmlcharrefreplace */
4585 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586
4587 /* Default to Latin-1 */
4588 if (mapping == NULL)
4589 return PyUnicode_EncodeLatin1(p, size, errors);
4590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 /* allocate enough for a simple encoding without
4592 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004593 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 if (res == NULL)
4595 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004596 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 while (inpos<size) {
4600 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004601 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4602 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004604 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605 if (charmap_encoding_error(p, size, &inpos, mapping,
4606 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004607 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004608 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004609 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 else
4613 /* done with this character => adjust input position */
4614 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004618 if (respos<PyString_GET_SIZE(res)) {
4619 if (_PyString_Resize(&res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 goto onError;
4621 }
4622 Py_XDECREF(exc);
4623 Py_XDECREF(errorHandler);
4624 return res;
4625
4626 onError:
4627 Py_XDECREF(res);
4628 Py_XDECREF(exc);
4629 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630 return NULL;
4631}
4632
4633PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4634 PyObject *mapping)
4635{
4636 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4637 PyErr_BadArgument();
4638 return NULL;
4639 }
4640 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4641 PyUnicode_GET_SIZE(unicode),
4642 mapping,
4643 NULL);
4644}
4645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646/* create or adjust a UnicodeTranslateError */
4647static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004648 const Py_UNICODE *unicode, Py_ssize_t size,
4649 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 if (*exceptionObject == NULL) {
4653 *exceptionObject = PyUnicodeTranslateError_Create(
4654 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655 }
4656 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4658 goto onError;
4659 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4660 goto onError;
4661 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4662 goto onError;
4663 return;
4664 onError:
4665 Py_DECREF(*exceptionObject);
4666 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 }
4668}
4669
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670/* raises a UnicodeTranslateError */
4671static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004672 const Py_UNICODE *unicode, Py_ssize_t size,
4673 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674 const char *reason)
4675{
4676 make_translate_exception(exceptionObject,
4677 unicode, size, startpos, endpos, reason);
4678 if (*exceptionObject != NULL)
4679 PyCodec_StrictErrors(*exceptionObject);
4680}
4681
4682/* error handling callback helper:
4683 build arguments, call the callback and check the arguments,
4684 put the result into newpos and return the replacement string, which
4685 has to be freed by the caller */
4686static PyObject *unicode_translate_call_errorhandler(const char *errors,
4687 PyObject **errorHandler,
4688 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004689 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4690 Py_ssize_t startpos, Py_ssize_t endpos,
4691 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004693 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004694
Martin v. Löwis412fb672006-04-13 06:34:32 +00004695 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 PyObject *restuple;
4697 PyObject *resunicode;
4698
4699 if (*errorHandler == NULL) {
4700 *errorHandler = PyCodec_LookupError(errors);
4701 if (*errorHandler == NULL)
4702 return NULL;
4703 }
4704
4705 make_translate_exception(exceptionObject,
4706 unicode, size, startpos, endpos, reason);
4707 if (*exceptionObject == NULL)
4708 return NULL;
4709
4710 restuple = PyObject_CallFunctionObjArgs(
4711 *errorHandler, *exceptionObject, NULL);
4712 if (restuple == NULL)
4713 return NULL;
4714 if (!PyTuple_Check(restuple)) {
4715 PyErr_Format(PyExc_TypeError, &argparse[4]);
4716 Py_DECREF(restuple);
4717 return NULL;
4718 }
4719 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004720 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 Py_DECREF(restuple);
4722 return NULL;
4723 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004724 if (i_newpos<0)
4725 *newpos = size+i_newpos;
4726 else
4727 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004728 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004729 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004730 Py_DECREF(restuple);
4731 return NULL;
4732 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 Py_INCREF(resunicode);
4734 Py_DECREF(restuple);
4735 return resunicode;
4736}
4737
4738/* Lookup the character ch in the mapping and put the result in result,
4739 which must be decrefed by the caller.
4740 Return 0 on success, -1 on error */
4741static
4742int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4743{
4744 PyObject *w = PyInt_FromLong((long)c);
4745 PyObject *x;
4746
4747 if (w == NULL)
4748 return -1;
4749 x = PyObject_GetItem(mapping, w);
4750 Py_DECREF(w);
4751 if (x == NULL) {
4752 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4753 /* No mapping found means: use 1:1 mapping. */
4754 PyErr_Clear();
4755 *result = NULL;
4756 return 0;
4757 } else
4758 return -1;
4759 }
4760 else if (x == Py_None) {
4761 *result = x;
4762 return 0;
4763 }
4764 else if (PyInt_Check(x)) {
4765 long value = PyInt_AS_LONG(x);
4766 long max = PyUnicode_GetMax();
4767 if (value < 0 || value > max) {
4768 PyErr_Format(PyExc_TypeError,
4769 "character mapping must be in range(0x%lx)", max+1);
4770 Py_DECREF(x);
4771 return -1;
4772 }
4773 *result = x;
4774 return 0;
4775 }
4776 else if (PyUnicode_Check(x)) {
4777 *result = x;
4778 return 0;
4779 }
4780 else {
4781 /* wrong return value */
4782 PyErr_SetString(PyExc_TypeError,
4783 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004784 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004785 return -1;
4786 }
4787}
4788/* ensure that *outobj is at least requiredsize characters long,
4789if not reallocate and adjust various state variables.
4790Return 0 on success, -1 on error */
4791static
Walter Dörwald4894c302003-10-24 14:25:28 +00004792int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004793 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004795 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004796 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004797 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004798 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004799 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004800 if (requiredsize < 2 * oldsize)
4801 requiredsize = 2 * oldsize;
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00004802 if (PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 return -1;
4804 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 }
4806 return 0;
4807}
4808/* lookup the character, put the result in the output string and adjust
4809 various state variables. Return a new reference to the object that
4810 was put in the output buffer in *result, or Py_None, if the mapping was
4811 undefined (in which case no character was written).
4812 The called must decref result.
4813 Return 0 on success, -1 on error. */
4814static
Walter Dörwald4894c302003-10-24 14:25:28 +00004815int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004816 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004817 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818{
Walter Dörwald4894c302003-10-24 14:25:28 +00004819 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 return -1;
4821 if (*res==NULL) {
4822 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004823 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 }
4825 else if (*res==Py_None)
4826 ;
4827 else if (PyInt_Check(*res)) {
4828 /* no overflow check, because we know that the space is enough */
4829 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4830 }
4831 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004832 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 if (repsize==1) {
4834 /* no overflow check, because we know that the space is enough */
4835 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4836 }
4837 else if (repsize!=0) {
4838 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004839 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004840 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004841 repsize - 1;
4842 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 return -1;
4844 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4845 *outp += repsize;
4846 }
4847 }
4848 else
4849 return -1;
4850 return 0;
4851}
4852
4853PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004854 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 PyObject *mapping,
4856 const char *errors)
4857{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 /* output object */
4859 PyObject *res = NULL;
4860 /* pointers to the beginning and end+1 of input */
4861 const Py_UNICODE *startp = p;
4862 const Py_UNICODE *endp = p + size;
4863 /* pointer into the output */
4864 Py_UNICODE *str;
4865 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004866 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 char *reason = "character maps to <undefined>";
4868 PyObject *errorHandler = NULL;
4869 PyObject *exc = NULL;
4870 /* the following variable is used for caching string comparisons
4871 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4872 * 3=ignore, 4=xmlcharrefreplace */
4873 int known_errorHandler = -1;
4874
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 if (mapping == NULL) {
4876 PyErr_BadArgument();
4877 return NULL;
4878 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879
4880 /* allocate enough for a simple 1:1 translation without
4881 replacements, if we need more, we'll resize */
4882 res = PyUnicode_FromUnicode(NULL, size);
4883 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004884 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004886 return res;
4887 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 while (p<endp) {
4890 /* try to encode it */
4891 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004892 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 goto onError;
4895 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004896 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 if (x!=Py_None) /* it worked => adjust input pointer */
4898 ++p;
4899 else { /* untranslatable character */
4900 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004901 Py_ssize_t repsize;
4902 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 Py_UNICODE *uni2;
4904 /* startpos for collecting untranslatable chars */
4905 const Py_UNICODE *collstart = p;
4906 const Py_UNICODE *collend = p+1;
4907 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909 /* find all untranslatable characters */
4910 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004911 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004912 goto onError;
4913 Py_XDECREF(x);
4914 if (x!=Py_None)
4915 break;
4916 ++collend;
4917 }
4918 /* cache callback name lookup
4919 * (if not done yet, i.e. it's the first error) */
4920 if (known_errorHandler==-1) {
4921 if ((errors==NULL) || (!strcmp(errors, "strict")))
4922 known_errorHandler = 1;
4923 else if (!strcmp(errors, "replace"))
4924 known_errorHandler = 2;
4925 else if (!strcmp(errors, "ignore"))
4926 known_errorHandler = 3;
4927 else if (!strcmp(errors, "xmlcharrefreplace"))
4928 known_errorHandler = 4;
4929 else
4930 known_errorHandler = 0;
4931 }
4932 switch (known_errorHandler) {
4933 case 1: /* strict */
4934 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4935 goto onError;
4936 case 2: /* replace */
4937 /* No need to check for space, this is a 1:1 replacement */
4938 for (coll = collstart; coll<collend; ++coll)
4939 *str++ = '?';
4940 /* fall through */
4941 case 3: /* ignore */
4942 p = collend;
4943 break;
4944 case 4: /* xmlcharrefreplace */
4945 /* generate replacement (temporarily (mis)uses p) */
4946 for (p = collstart; p < collend; ++p) {
4947 char buffer[2+29+1+1];
4948 char *cp;
4949 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004950 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004951 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4952 goto onError;
4953 for (cp = buffer; *cp; ++cp)
4954 *str++ = *cp;
4955 }
4956 p = collend;
4957 break;
4958 default:
4959 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4960 reason, startp, size, &exc,
4961 collstart-startp, collend-startp, &newpos);
4962 if (repunicode == NULL)
4963 goto onError;
4964 /* generate replacement */
4965 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004966 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4968 Py_DECREF(repunicode);
4969 goto onError;
4970 }
4971 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4972 *str++ = *uni2;
4973 p = startp + newpos;
4974 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 }
4976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978 /* Resize if we allocated to much */
4979 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004980 if (respos<PyUnicode_GET_SIZE(res)) {
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00004981 if (PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004982 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004983 }
4984 Py_XDECREF(exc);
4985 Py_XDECREF(errorHandler);
4986 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988 onError:
4989 Py_XDECREF(res);
4990 Py_XDECREF(exc);
4991 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992 return NULL;
4993}
4994
4995PyObject *PyUnicode_Translate(PyObject *str,
4996 PyObject *mapping,
4997 const char *errors)
4998{
4999 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005000
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001 str = PyUnicode_FromObject(str);
5002 if (str == NULL)
5003 goto onError;
5004 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5005 PyUnicode_GET_SIZE(str),
5006 mapping,
5007 errors);
5008 Py_DECREF(str);
5009 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005010
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011 onError:
5012 Py_XDECREF(str);
5013 return NULL;
5014}
Tim Petersced69f82003-09-16 20:30:58 +00005015
Guido van Rossum9e896b32000-04-05 20:11:21 +00005016/* --- Decimal Encoder ---------------------------------------------------- */
5017
5018int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005019 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005020 char *output,
5021 const char *errors)
5022{
5023 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 PyObject *errorHandler = NULL;
5025 PyObject *exc = NULL;
5026 const char *encoding = "decimal";
5027 const char *reason = "invalid decimal Unicode string";
5028 /* the following variable is used for caching string comparisons
5029 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5030 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005031
5032 if (output == NULL) {
5033 PyErr_BadArgument();
5034 return -1;
5035 }
5036
5037 p = s;
5038 end = s + length;
5039 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005041 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005042 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005043 Py_ssize_t repsize;
5044 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 Py_UNICODE *uni2;
5046 Py_UNICODE *collstart;
5047 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005048
Guido van Rossum9e896b32000-04-05 20:11:21 +00005049 if (Py_UNICODE_ISSPACE(ch)) {
5050 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005051 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005052 continue;
5053 }
5054 decimal = Py_UNICODE_TODECIMAL(ch);
5055 if (decimal >= 0) {
5056 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005057 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005058 continue;
5059 }
Guido van Rossumba477042000-04-06 18:18:10 +00005060 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005061 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005062 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005063 continue;
5064 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 /* All other characters are considered unencodable */
5066 collstart = p;
5067 collend = p+1;
5068 while (collend < end) {
5069 if ((0 < *collend && *collend < 256) ||
5070 !Py_UNICODE_ISSPACE(*collend) ||
5071 Py_UNICODE_TODECIMAL(*collend))
5072 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074 /* cache callback name lookup
5075 * (if not done yet, i.e. it's the first error) */
5076 if (known_errorHandler==-1) {
5077 if ((errors==NULL) || (!strcmp(errors, "strict")))
5078 known_errorHandler = 1;
5079 else if (!strcmp(errors, "replace"))
5080 known_errorHandler = 2;
5081 else if (!strcmp(errors, "ignore"))
5082 known_errorHandler = 3;
5083 else if (!strcmp(errors, "xmlcharrefreplace"))
5084 known_errorHandler = 4;
5085 else
5086 known_errorHandler = 0;
5087 }
5088 switch (known_errorHandler) {
5089 case 1: /* strict */
5090 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5091 goto onError;
5092 case 2: /* replace */
5093 for (p = collstart; p < collend; ++p)
5094 *output++ = '?';
5095 /* fall through */
5096 case 3: /* ignore */
5097 p = collend;
5098 break;
5099 case 4: /* xmlcharrefreplace */
5100 /* generate replacement (temporarily (mis)uses p) */
5101 for (p = collstart; p < collend; ++p)
5102 output += sprintf(output, "&#%d;", (int)*p);
5103 p = collend;
5104 break;
5105 default:
5106 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5107 encoding, reason, s, length, &exc,
5108 collstart-s, collend-s, &newpos);
5109 if (repunicode == NULL)
5110 goto onError;
5111 /* generate replacement */
5112 repsize = PyUnicode_GET_SIZE(repunicode);
5113 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5114 Py_UNICODE ch = *uni2;
5115 if (Py_UNICODE_ISSPACE(ch))
5116 *output++ = ' ';
5117 else {
5118 decimal = Py_UNICODE_TODECIMAL(ch);
5119 if (decimal >= 0)
5120 *output++ = '0' + decimal;
5121 else if (0 < ch && ch < 256)
5122 *output++ = (char)ch;
5123 else {
5124 Py_DECREF(repunicode);
5125 raise_encode_exception(&exc, encoding,
5126 s, length, collstart-s, collend-s, reason);
5127 goto onError;
5128 }
5129 }
5130 }
5131 p = s + newpos;
5132 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005133 }
5134 }
5135 /* 0-terminate the output string */
5136 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137 Py_XDECREF(exc);
5138 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005139 return 0;
5140
5141 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005142 Py_XDECREF(exc);
5143 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005144 return -1;
5145}
5146
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147/* --- Helpers ------------------------------------------------------------ */
5148
Eric Smitha9f7d622008-02-17 19:46:49 +00005149#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005150
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005151#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005152
Fredrik Lundha50d2012006-05-26 17:04:58 +00005153#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005154
5155#include "stringlib/count.h"
5156#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005157#include "stringlib/partition.h"
5158
/* helper macro to fixup start/end slice values: operates on the variables
   start and end of the enclosing scope, using (obj)->length as the
   sequence length.  Negative values are taken relative to the length and
   then clamped to 0; end is additionally capped at the length.
   Wrapped in do { } while (0) so that the expansion is a single statement
   (safe under an unbraced if/else); invoke it followed by a semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5171
Martin v. Löwis18e16552006-02-15 17:27:45 +00005172Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005173 PyObject *substr,
5174 Py_ssize_t start,
5175 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005177 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005178 PyUnicodeObject* str_obj;
5179 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005180
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005181 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5182 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005184 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5185 if (!sub_obj) {
5186 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 return -1;
5188 }
Tim Petersced69f82003-09-16 20:30:58 +00005189
Fredrik Lundhc8162812006-05-26 19:33:03 +00005190 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005191
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005192 result = stringlib_count(
5193 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5194 );
5195
5196 Py_DECREF(sub_obj);
5197 Py_DECREF(str_obj);
5198
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 return result;
5200}
5201
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005203 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005204 Py_ssize_t start,
5205 Py_ssize_t end,
5206 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005208 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005209
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005210 str = PyUnicode_FromObject(str);
5211 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005212 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005213 sub = PyUnicode_FromObject(sub);
5214 if (!sub) {
5215 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005216 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 }
Tim Petersced69f82003-09-16 20:30:58 +00005218
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005219 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005220 result = stringlib_find_slice(
5221 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5222 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5223 start, end
5224 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005225 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005226 result = stringlib_rfind_slice(
5227 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5228 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5229 start, end
5230 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005231
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005232 Py_DECREF(str);
5233 Py_DECREF(sub);
5234
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 return result;
5236}
5237
Tim Petersced69f82003-09-16 20:30:58 +00005238static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239int tailmatch(PyUnicodeObject *self,
5240 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005241 Py_ssize_t start,
5242 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 int direction)
5244{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 if (substring->length == 0)
5246 return 1;
5247
Fredrik Lundhc8162812006-05-26 19:33:03 +00005248 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249
5250 end -= substring->length;
5251 if (end < start)
5252 return 0;
5253
5254 if (direction > 0) {
5255 if (Py_UNICODE_MATCH(self, end, substring))
5256 return 1;
5257 } else {
5258 if (Py_UNICODE_MATCH(self, start, substring))
5259 return 1;
5260 }
5261
5262 return 0;
5263}
5264
Martin v. Löwis18e16552006-02-15 17:27:45 +00005265Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 Py_ssize_t start,
5268 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 int direction)
5270{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005271 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005272
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 str = PyUnicode_FromObject(str);
5274 if (str == NULL)
5275 return -1;
5276 substr = PyUnicode_FromObject(substr);
5277 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005278 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 return -1;
5280 }
Tim Petersced69f82003-09-16 20:30:58 +00005281
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 result = tailmatch((PyUnicodeObject *)str,
5283 (PyUnicodeObject *)substr,
5284 start, end, direction);
5285 Py_DECREF(str);
5286 Py_DECREF(substr);
5287 return result;
5288}
5289
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290/* Apply fixfct filter to the Unicode object self and return a
5291 reference to the modified object */
5292
Tim Petersced69f82003-09-16 20:30:58 +00005293static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294PyObject *fixup(PyUnicodeObject *self,
5295 int (*fixfct)(PyUnicodeObject *s))
5296{
5297
5298 PyUnicodeObject *u;
5299
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005300 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 if (u == NULL)
5302 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005303
5304 Py_UNICODE_COPY(u->str, self->str, self->length);
5305
Tim Peters7a29bd52001-09-12 03:03:31 +00005306 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 /* fixfct should return TRUE if it modified the buffer. If
5308 FALSE, return a reference to the original buffer instead
5309 (to save space, not time) */
5310 Py_INCREF(self);
5311 Py_DECREF(u);
5312 return (PyObject*) self;
5313 }
5314 return (PyObject*) u;
5315}
5316
Tim Petersced69f82003-09-16 20:30:58 +00005317static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318int fixupper(PyUnicodeObject *self)
5319{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005320 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 Py_UNICODE *s = self->str;
5322 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005323
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 while (len-- > 0) {
5325 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005326
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 ch = Py_UNICODE_TOUPPER(*s);
5328 if (ch != *s) {
5329 status = 1;
5330 *s = ch;
5331 }
5332 s++;
5333 }
5334
5335 return status;
5336}
5337
Tim Petersced69f82003-09-16 20:30:58 +00005338static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339int fixlower(PyUnicodeObject *self)
5340{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005341 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 Py_UNICODE *s = self->str;
5343 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005344
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 while (len-- > 0) {
5346 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005347
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 ch = Py_UNICODE_TOLOWER(*s);
5349 if (ch != *s) {
5350 status = 1;
5351 *s = ch;
5352 }
5353 s++;
5354 }
5355
5356 return status;
5357}
5358
Tim Petersced69f82003-09-16 20:30:58 +00005359static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360int fixswapcase(PyUnicodeObject *self)
5361{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005362 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 Py_UNICODE *s = self->str;
5364 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005365
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 while (len-- > 0) {
5367 if (Py_UNICODE_ISUPPER(*s)) {
5368 *s = Py_UNICODE_TOLOWER(*s);
5369 status = 1;
5370 } else if (Py_UNICODE_ISLOWER(*s)) {
5371 *s = Py_UNICODE_TOUPPER(*s);
5372 status = 1;
5373 }
5374 s++;
5375 }
5376
5377 return status;
5378}
5379
Tim Petersced69f82003-09-16 20:30:58 +00005380static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381int fixcapitalize(PyUnicodeObject *self)
5382{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005383 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005384 Py_UNICODE *s = self->str;
5385 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005386
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005387 if (len == 0)
5388 return 0;
5389 if (Py_UNICODE_ISLOWER(*s)) {
5390 *s = Py_UNICODE_TOUPPER(*s);
5391 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005393 s++;
5394 while (--len > 0) {
5395 if (Py_UNICODE_ISUPPER(*s)) {
5396 *s = Py_UNICODE_TOLOWER(*s);
5397 status = 1;
5398 }
5399 s++;
5400 }
5401 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402}
5403
5404static
5405int fixtitle(PyUnicodeObject *self)
5406{
5407 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5408 register Py_UNICODE *e;
5409 int previous_is_cased;
5410
5411 /* Shortcut for single character strings */
5412 if (PyUnicode_GET_SIZE(self) == 1) {
5413 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5414 if (*p != ch) {
5415 *p = ch;
5416 return 1;
5417 }
5418 else
5419 return 0;
5420 }
Tim Petersced69f82003-09-16 20:30:58 +00005421
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 e = p + PyUnicode_GET_SIZE(self);
5423 previous_is_cased = 0;
5424 for (; p < e; p++) {
5425 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005426
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 if (previous_is_cased)
5428 *p = Py_UNICODE_TOLOWER(ch);
5429 else
5430 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005431
5432 if (Py_UNICODE_ISLOWER(ch) ||
5433 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 Py_UNICODE_ISTITLE(ch))
5435 previous_is_cased = 1;
5436 else
5437 previous_is_cased = 0;
5438 }
5439 return 1;
5440}
5441
Tim Peters8ce9f162004-08-27 01:49:32 +00005442PyObject *
5443PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444{
Tim Peters8ce9f162004-08-27 01:49:32 +00005445 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005446 const Py_UNICODE blank = ' ';
5447 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005448 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005449 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005450 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5451 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005452 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5453 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005454 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005455 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005456 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457
Tim Peters05eba1f2004-08-27 21:32:02 +00005458 fseq = PySequence_Fast(seq, "");
5459 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005460 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005461 }
5462
Tim Peters91879ab2004-08-27 22:35:44 +00005463 /* Grrrr. A codec may be invoked to convert str objects to
5464 * Unicode, and so it's possible to call back into Python code
5465 * during PyUnicode_FromObject(), and so it's possible for a sick
5466 * codec to change the size of fseq (if seq is a list). Therefore
5467 * we have to keep refetching the size -- can't assume seqlen
5468 * is invariant.
5469 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005470 seqlen = PySequence_Fast_GET_SIZE(fseq);
5471 /* If empty sequence, return u"". */
5472 if (seqlen == 0) {
5473 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5474 goto Done;
5475 }
5476 /* If singleton sequence with an exact Unicode, return that. */
5477 if (seqlen == 1) {
5478 item = PySequence_Fast_GET_ITEM(fseq, 0);
5479 if (PyUnicode_CheckExact(item)) {
5480 Py_INCREF(item);
5481 res = (PyUnicodeObject *)item;
5482 goto Done;
5483 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005484 }
5485
Tim Peters05eba1f2004-08-27 21:32:02 +00005486 /* At least two items to join, or one that isn't exact Unicode. */
5487 if (seqlen > 1) {
5488 /* Set up sep and seplen -- they're needed. */
5489 if (separator == NULL) {
5490 sep = &blank;
5491 seplen = 1;
5492 }
5493 else {
5494 internal_separator = PyUnicode_FromObject(separator);
5495 if (internal_separator == NULL)
5496 goto onError;
5497 sep = PyUnicode_AS_UNICODE(internal_separator);
5498 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005499 /* In case PyUnicode_FromObject() mutated seq. */
5500 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005501 }
5502 }
5503
5504 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005505 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005506 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005507 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 res_p = PyUnicode_AS_UNICODE(res);
5509 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005510
Tim Peters05eba1f2004-08-27 21:32:02 +00005511 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005512 Py_ssize_t itemlen;
5513 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005514
5515 item = PySequence_Fast_GET_ITEM(fseq, i);
5516 /* Convert item to Unicode. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00005517 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005518 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005519 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005520 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005521 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005522 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005523 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005524 item = PyUnicode_FromObject(item);
5525 if (item == NULL)
5526 goto onError;
5527 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005528
Tim Peters91879ab2004-08-27 22:35:44 +00005529 /* In case PyUnicode_FromObject() mutated seq. */
5530 seqlen = PySequence_Fast_GET_SIZE(fseq);
5531
Tim Peters8ce9f162004-08-27 01:49:32 +00005532 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005534 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005535 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005536 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005537 if (i < seqlen - 1) {
5538 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005539 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005540 goto Overflow;
5541 }
5542 if (new_res_used > res_alloc) {
5543 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005544 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005545 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005546 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005547 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005548 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005549 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005550 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005552 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005553 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005555
5556 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005557 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005558 res_p += itemlen;
5559 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005560 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005561 res_p += seplen;
5562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005564 res_used = new_res_used;
5565 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005566
Tim Peters05eba1f2004-08-27 21:32:02 +00005567 /* Shrink res to match the used area; this probably can't fail,
5568 * but it's cheap to check.
5569 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005570 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005571 goto onError;
5572
5573 Done:
5574 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005575 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 return (PyObject *)res;
5577
Tim Peters8ce9f162004-08-27 01:49:32 +00005578 Overflow:
5579 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005580 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005581 Py_DECREF(item);
5582 /* fall through */
5583
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005585 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005586 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005587 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 return NULL;
5589}
5590
Tim Petersced69f82003-09-16 20:30:58 +00005591static
5592PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005593 Py_ssize_t left,
5594 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 Py_UNICODE fill)
5596{
5597 PyUnicodeObject *u;
5598
5599 if (left < 0)
5600 left = 0;
5601 if (right < 0)
5602 right = 0;
5603
Tim Peters7a29bd52001-09-12 03:03:31 +00005604 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 Py_INCREF(self);
5606 return self;
5607 }
5608
Neal Norwitze7d8be82008-07-31 17:17:14 +00005609 if (left > PY_SSIZE_T_MAX - self->length ||
5610 right > PY_SSIZE_T_MAX - (left + self->length)) {
5611 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5612 return NULL;
5613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 u = _PyUnicode_New(left + self->length + right);
5615 if (u) {
5616 if (left)
5617 Py_UNICODE_FILL(u->str, fill, left);
5618 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5619 if (right)
5620 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5621 }
5622
5623 return u;
5624}
5625
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The macro expects the enclosing function to provide a `PyObject *str`
   scratch variable, a `list` variable and an `onError` label; control
   jumps to `onError` when the slice cannot be created or the append
   fails.  The temporary reference held in `str` is released on every
   path.

   The body is wrapped in do { ... } while (0) so that the expansion is
   a single statement and stays correct inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5636
5637static
5638PyObject *split_whitespace(PyUnicodeObject *self,
5639 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005642 register Py_ssize_t i;
5643 register Py_ssize_t j;
5644 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005646 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647
5648 for (i = j = 0; i < len; ) {
5649 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005650 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 i++;
5652 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005653 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 i++;
5655 if (j < i) {
5656 if (maxcount-- <= 0)
5657 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005658 SPLIT_APPEND(buf, j, i);
5659 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 i++;
5661 j = i;
5662 }
5663 }
5664 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005665 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 }
5667 return list;
5668
5669 onError:
5670 Py_DECREF(list);
5671 return NULL;
5672}
5673
5674PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005675 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005677 register Py_ssize_t i;
5678 register Py_ssize_t j;
5679 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 PyObject *list;
5681 PyObject *str;
5682 Py_UNICODE *data;
5683
5684 string = PyUnicode_FromObject(string);
5685 if (string == NULL)
5686 return NULL;
5687 data = PyUnicode_AS_UNICODE(string);
5688 len = PyUnicode_GET_SIZE(string);
5689
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 list = PyList_New(0);
5691 if (!list)
5692 goto onError;
5693
5694 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005695 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005696
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005698 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
5701 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005702 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 if (i < len) {
5704 if (data[i] == '\r' && i + 1 < len &&
5705 data[i+1] == '\n')
5706 i += 2;
5707 else
5708 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005709 if (keepends)
5710 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 }
Guido van Rossum86662912000-04-11 15:38:46 +00005712 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 j = i;
5714 }
5715 if (j < len) {
5716 SPLIT_APPEND(data, j, len);
5717 }
5718
5719 Py_DECREF(string);
5720 return list;
5721
5722 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005723 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 Py_DECREF(string);
5725 return NULL;
5726}
5727
Tim Petersced69f82003-09-16 20:30:58 +00005728static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729PyObject *split_char(PyUnicodeObject *self,
5730 PyObject *list,
5731 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005732 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005734 register Py_ssize_t i;
5735 register Py_ssize_t j;
5736 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005738 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
5740 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005741 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 if (maxcount-- <= 0)
5743 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005744 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 i = j = i + 1;
5746 } else
5747 i++;
5748 }
5749 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005750 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 }
5752 return list;
5753
5754 onError:
5755 Py_DECREF(list);
5756 return NULL;
5757}
5758
Tim Petersced69f82003-09-16 20:30:58 +00005759static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760PyObject *split_substring(PyUnicodeObject *self,
5761 PyObject *list,
5762 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005763 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005765 register Py_ssize_t i;
5766 register Py_ssize_t j;
5767 Py_ssize_t len = self->length;
5768 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 PyObject *str;
5770
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005771 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 if (Py_UNICODE_MATCH(self, i, substring)) {
5773 if (maxcount-- <= 0)
5774 break;
5775 SPLIT_APPEND(self->str, j, i);
5776 i = j = i + sublen;
5777 } else
5778 i++;
5779 }
5780 if (j <= len) {
5781 SPLIT_APPEND(self->str, j, len);
5782 }
5783 return list;
5784
5785 onError:
5786 Py_DECREF(list);
5787 return NULL;
5788}
5789
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005790static
5791PyObject *rsplit_whitespace(PyUnicodeObject *self,
5792 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005793 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005794{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005795 register Py_ssize_t i;
5796 register Py_ssize_t j;
5797 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005798 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005799 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005800
5801 for (i = j = len - 1; i >= 0; ) {
5802 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005803 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005804 i--;
5805 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005806 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005807 i--;
5808 if (j > i) {
5809 if (maxcount-- <= 0)
5810 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005811 SPLIT_APPEND(buf, i + 1, j + 1);
5812 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005813 i--;
5814 j = i;
5815 }
5816 }
5817 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005818 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005819 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005820 if (PyList_Reverse(list) < 0)
5821 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005822 return list;
5823
5824 onError:
5825 Py_DECREF(list);
5826 return NULL;
5827}
5828
5829static
5830PyObject *rsplit_char(PyUnicodeObject *self,
5831 PyObject *list,
5832 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005834{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005835 register Py_ssize_t i;
5836 register Py_ssize_t j;
5837 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005838 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005839 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005840
5841 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005842 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005843 if (maxcount-- <= 0)
5844 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005845 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005846 j = i = i - 1;
5847 } else
5848 i--;
5849 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005850 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005851 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005852 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005853 if (PyList_Reverse(list) < 0)
5854 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005855 return list;
5856
5857 onError:
5858 Py_DECREF(list);
5859 return NULL;
5860}
5861
5862static
5863PyObject *rsplit_substring(PyUnicodeObject *self,
5864 PyObject *list,
5865 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 register Py_ssize_t i;
5869 register Py_ssize_t j;
5870 Py_ssize_t len = self->length;
5871 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005872 PyObject *str;
5873
5874 for (i = len - sublen, j = len; i >= 0; ) {
5875 if (Py_UNICODE_MATCH(self, i, substring)) {
5876 if (maxcount-- <= 0)
5877 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005878 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005879 j = i;
5880 i -= sublen;
5881 } else
5882 i--;
5883 }
5884 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005885 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005886 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005887 if (PyList_Reverse(list) < 0)
5888 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005889 return list;
5890
5891 onError:
5892 Py_DECREF(list);
5893 return NULL;
5894}
5895
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896#undef SPLIT_APPEND
5897
5898static
5899PyObject *split(PyUnicodeObject *self,
5900 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005901 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902{
5903 PyObject *list;
5904
5905 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005906 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907
5908 list = PyList_New(0);
5909 if (!list)
5910 return NULL;
5911
5912 if (substring == NULL)
5913 return split_whitespace(self,list,maxcount);
5914
5915 else if (substring->length == 1)
5916 return split_char(self,list,substring->str[0],maxcount);
5917
5918 else if (substring->length == 0) {
5919 Py_DECREF(list);
5920 PyErr_SetString(PyExc_ValueError, "empty separator");
5921 return NULL;
5922 }
5923 else
5924 return split_substring(self,list,substring,maxcount);
5925}
5926
Tim Petersced69f82003-09-16 20:30:58 +00005927static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005928PyObject *rsplit(PyUnicodeObject *self,
5929 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005930 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005931{
5932 PyObject *list;
5933
5934 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005935 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005936
5937 list = PyList_New(0);
5938 if (!list)
5939 return NULL;
5940
5941 if (substring == NULL)
5942 return rsplit_whitespace(self,list,maxcount);
5943
5944 else if (substring->length == 1)
5945 return rsplit_char(self,list,substring->str[0],maxcount);
5946
5947 else if (substring->length == 0) {
5948 Py_DECREF(list);
5949 PyErr_SetString(PyExc_ValueError, "empty separator");
5950 return NULL;
5951 }
5952 else
5953 return rsplit_substring(self,list,substring,maxcount);
5954}
5955
5956static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957PyObject *replace(PyUnicodeObject *self,
5958 PyUnicodeObject *str1,
5959 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005960 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961{
5962 PyUnicodeObject *u;
5963
5964 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005965 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
Fredrik Lundh347ee272006-05-24 16:35:18 +00005967 if (str1->length == str2->length) {
5968 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005969 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005970 if (str1->length == 1) {
5971 /* replace characters */
5972 Py_UNICODE u1, u2;
5973 if (!findchar(self->str, self->length, str1->str[0]))
5974 goto nothing;
5975 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5976 if (!u)
5977 return NULL;
5978 Py_UNICODE_COPY(u->str, self->str, self->length);
5979 u1 = str1->str[0];
5980 u2 = str2->str[0];
5981 for (i = 0; i < u->length; i++)
5982 if (u->str[i] == u1) {
5983 if (--maxcount < 0)
5984 break;
5985 u->str[i] = u2;
5986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005988 i = fastsearch(
5989 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005991 if (i < 0)
5992 goto nothing;
5993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5994 if (!u)
5995 return NULL;
5996 Py_UNICODE_COPY(u->str, self->str, self->length);
5997 while (i <= self->length - str1->length)
5998 if (Py_UNICODE_MATCH(self, i, str1)) {
5999 if (--maxcount < 0)
6000 break;
6001 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6002 i += str1->length;
6003 } else
6004 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006007
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006008 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006009 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 Py_UNICODE *p;
6011
6012 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006013 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 if (n > maxcount)
6015 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006016 if (n == 0)
6017 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006018 /* new_size = self->length + n * (str2->length - str1->length)); */
6019 delta = (str2->length - str1->length);
6020 if (delta == 0) {
6021 new_size = self->length;
6022 } else {
6023 product = n * (str2->length - str1->length);
6024 if ((product / (str2->length - str1->length)) != n) {
6025 PyErr_SetString(PyExc_OverflowError,
6026 "replace string is too long");
6027 return NULL;
6028 }
6029 new_size = self->length + product;
6030 if (new_size < 0) {
6031 PyErr_SetString(PyExc_OverflowError,
6032 "replace string is too long");
6033 return NULL;
6034 }
6035 }
6036 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006037 if (!u)
6038 return NULL;
6039 i = 0;
6040 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006041 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006042 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006043 while (n-- > 0) {
6044 /* look for next match */
6045 j = i;
6046 while (j <= e) {
6047 if (Py_UNICODE_MATCH(self, j, str1))
6048 break;
6049 j++;
6050 }
6051 if (j > i) {
6052 if (j > e)
6053 break;
6054 /* copy unchanged part [i:j] */
6055 Py_UNICODE_COPY(p, self->str+i, j-i);
6056 p += j - i;
6057 }
6058 /* copy substitution string */
6059 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006060 Py_UNICODE_COPY(p, str2->str, str2->length);
6061 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006062 }
6063 i = j + str1->length;
6064 }
6065 if (i < self->length)
6066 /* copy tail [i:] */
6067 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006068 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006069 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006070 while (n > 0) {
6071 Py_UNICODE_COPY(p, str2->str, str2->length);
6072 p += str2->length;
6073 if (--n <= 0)
6074 break;
6075 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006077 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 }
6079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006081
6082nothing:
6083 /* nothing to replace; return original string (when possible) */
6084 if (PyUnicode_CheckExact(self)) {
6085 Py_INCREF(self);
6086 return (PyObject *) self;
6087 }
6088 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089}
6090
6091/* --- Unicode Object Methods --------------------------------------------- */
6092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006093PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094"S.title() -> unicode\n\
6095\n\
6096Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006097characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
6099static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006100unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 return fixup(self, fixtitle);
6103}
6104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006105PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106"S.capitalize() -> unicode\n\
6107\n\
6108Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006109have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
6111static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006112unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 return fixup(self, fixcapitalize);
6115}
6116
6117#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006118PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119"S.capwords() -> unicode\n\
6120\n\
6121Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006122normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
6124static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006125unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126{
6127 PyObject *list;
6128 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 /* Split into words */
6132 list = split(self, NULL, -1);
6133 if (!list)
6134 return NULL;
6135
6136 /* Capitalize each word */
6137 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6138 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6139 fixcapitalize);
6140 if (item == NULL)
6141 goto onError;
6142 Py_DECREF(PyList_GET_ITEM(list, i));
6143 PyList_SET_ITEM(list, i, item);
6144 }
6145
6146 /* Join the words to form a new string */
6147 item = PyUnicode_Join(NULL, list);
6148
6149onError:
6150 Py_DECREF(list);
6151 return (PyObject *)item;
6152}
6153#endif
6154
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006155/* Argument converter. Coerces to a single unicode character */
6156
6157static int
6158convert_uc(PyObject *obj, void *addr)
6159{
6160 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6161 PyObject *uniobj;
6162 Py_UNICODE *unistr;
6163
6164 uniobj = PyUnicode_FromObject(obj);
6165 if (uniobj == NULL) {
6166 PyErr_SetString(PyExc_TypeError,
6167 "The fill character cannot be converted to Unicode");
6168 return 0;
6169 }
6170 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6171 PyErr_SetString(PyExc_TypeError,
6172 "The fill character must be exactly one character long");
6173 Py_DECREF(uniobj);
6174 return 0;
6175 }
6176 unistr = PyUnicode_AS_UNICODE(uniobj);
6177 *fillcharloc = unistr[0];
6178 Py_DECREF(uniobj);
6179 return 1;
6180}
6181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006182PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006183"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006185Return S centered in a Unicode string of length width. Padding is\n\
6186done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
6188static PyObject *
6189unicode_center(PyUnicodeObject *self, PyObject *args)
6190{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006191 Py_ssize_t marg, left;
6192 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006193 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194
Thomas Woutersde017742006-02-16 19:34:37 +00006195 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 return NULL;
6197
Tim Peters7a29bd52001-09-12 03:03:31 +00006198 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 Py_INCREF(self);
6200 return (PyObject*) self;
6201 }
6202
6203 marg = width - self->length;
6204 left = marg / 2 + (marg & width & 1);
6205
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006206 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207}
6208
Marc-André Lemburge5034372000-08-08 08:04:29 +00006209#if 0
6210
6211/* This code should go into some future Unicode collation support
6212 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006213 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006214
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006215/* speedy UTF-16 code point order comparison */
6216/* gleaned from: */
6217/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6218
/* Per-bucket offsets, indexed by (code unit >> 11), i.e. one entry per
   0x800-wide range of code units.  Only the top five buckets carry a
   non-zero offset: bucket 27 (0xD800-0xDFFF) is shifted up by 0x2000,
   buckets 28-31 (0xE000-0xFFFF) are shifted down by 0x800. */
static short utf16Fixup[32] =
{
    /*  0 -  3 */ 0, 0, 0, 0,
    /*  4 -  7 */ 0, 0, 0, 0,
    /*  8 - 11 */ 0, 0, 0, 0,
    /* 12 - 15 */ 0, 0, 0, 0,
    /* 16 - 19 */ 0, 0, 0, 0,
    /* 20 - 23 */ 0, 0, 0, 0,
    /* 24 - 27 */ 0, 0, 0, 0x2000,
    /* 28 - 31 */ -0x800, -0x800, -0x800, -0x800
};
6226
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227static int
6228unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6229{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006230 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006231
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 Py_UNICODE *s1 = str1->str;
6233 Py_UNICODE *s2 = str2->str;
6234
6235 len1 = str1->length;
6236 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006237
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006239 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006240
6241 c1 = *s1++;
6242 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006243
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006244 if (c1 > (1<<11) * 26)
6245 c1 += utf16Fixup[c1>>11];
6246 if (c2 > (1<<11) * 26)
6247 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006248 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006249
6250 if (c1 != c2)
6251 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006252
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006253 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 }
6255
6256 return (len1 < len2) ? -1 : (len1 != len2);
6257}
6258
Marc-André Lemburge5034372000-08-08 08:04:29 +00006259#else
6260
6261static int
6262unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6263{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006264 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006265
6266 Py_UNICODE *s1 = str1->str;
6267 Py_UNICODE *s2 = str2->str;
6268
6269 len1 = str1->length;
6270 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006271
Marc-André Lemburge5034372000-08-08 08:04:29 +00006272 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006273 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006274
Fredrik Lundh45714e92001-06-26 16:39:36 +00006275 c1 = *s1++;
6276 c2 = *s2++;
6277
6278 if (c1 != c2)
6279 return (c1 < c2) ? -1 : 1;
6280
Marc-André Lemburge5034372000-08-08 08:04:29 +00006281 len1--; len2--;
6282 }
6283
6284 return (len1 < len2) ? -1 : (len1 != len2);
6285}
6286
6287#endif
6288
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289int PyUnicode_Compare(PyObject *left,
6290 PyObject *right)
6291{
6292 PyUnicodeObject *u = NULL, *v = NULL;
6293 int result;
6294
6295 /* Coerce the two arguments */
6296 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6297 if (u == NULL)
6298 goto onError;
6299 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6300 if (v == NULL)
6301 goto onError;
6302
Thomas Wouters7e474022000-07-16 12:04:32 +00006303 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 if (v == u) {
6305 Py_DECREF(u);
6306 Py_DECREF(v);
6307 return 0;
6308 }
6309
6310 result = unicode_compare(u, v);
6311
6312 Py_DECREF(u);
6313 Py_DECREF(v);
6314 return result;
6315
6316onError:
6317 Py_XDECREF(u);
6318 Py_XDECREF(v);
6319 return -1;
6320}
6321
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006322PyObject *PyUnicode_RichCompare(PyObject *left,
6323 PyObject *right,
6324 int op)
6325{
6326 int result;
6327
6328 result = PyUnicode_Compare(left, right);
6329 if (result == -1 && PyErr_Occurred())
6330 goto onError;
6331
6332 /* Convert the return value to a Boolean */
6333 switch (op) {
6334 case Py_EQ:
6335 result = (result == 0);
6336 break;
6337 case Py_NE:
6338 result = (result != 0);
6339 break;
6340 case Py_LE:
6341 result = (result <= 0);
6342 break;
6343 case Py_GE:
6344 result = (result >= 0);
6345 break;
6346 case Py_LT:
6347 result = (result == -1);
6348 break;
6349 case Py_GT:
6350 result = (result == 1);
6351 break;
6352 }
6353 return PyBool_FromLong(result);
6354
6355 onError:
6356
6357 /* Standard case
6358
6359 Type errors mean that PyUnicode_FromObject() could not convert
6360 one of the arguments (usually the right hand side) to Unicode,
6361 ie. we can't handle the comparison request. However, it is
6362 possible that the other object knows a comparison method, which
6363 is why we return Py_NotImplemented to give the other object a
6364 chance.
6365
6366 */
6367 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6368 PyErr_Clear();
6369 Py_INCREF(Py_NotImplemented);
6370 return Py_NotImplemented;
6371 }
6372 if (op != Py_EQ && op != Py_NE)
6373 return NULL;
6374
6375 /* Equality comparison.
6376
6377 This is a special case: we silence any PyExc_UnicodeDecodeError
6378 and instead turn it into a PyErr_UnicodeWarning.
6379
6380 */
6381 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6382 return NULL;
6383 PyErr_Clear();
6384 if (PyErr_Warn(PyExc_UnicodeWarning,
6385 (op == Py_EQ) ?
6386 "Unicode equal comparison "
6387 "failed to convert both arguments to Unicode - "
6388 "interpreting them as being unequal" :
6389 "Unicode unequal comparison "
6390 "failed to convert both arguments to Unicode - "
6391 "interpreting them as being unequal"
6392 ) < 0)
6393 return NULL;
6394 result = (op == Py_NE);
6395 return PyBool_FromLong(result);
6396}
6397
Guido van Rossum403d68b2000-03-13 15:55:09 +00006398int PyUnicode_Contains(PyObject *container,
6399 PyObject *element)
6400{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006401 PyObject *str, *sub;
6402 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006403
6404 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006405 sub = PyUnicode_FromObject(element);
6406 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006407 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006408 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006409 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006410 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006411
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006412 str = PyUnicode_FromObject(container);
6413 if (!str) {
6414 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006415 return -1;
6416 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006417
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006418 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006419
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006420 Py_DECREF(str);
6421 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006422
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006423 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006424}
6425
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426/* Concat to string or Unicode object giving a new Unicode object. */
6427
6428PyObject *PyUnicode_Concat(PyObject *left,
6429 PyObject *right)
6430{
6431 PyUnicodeObject *u = NULL, *v = NULL, *w;
6432
6433 /* Coerce the two arguments */
6434 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6435 if (u == NULL)
6436 goto onError;
6437 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6438 if (v == NULL)
6439 goto onError;
6440
6441 /* Shortcuts */
6442 if (v == unicode_empty) {
6443 Py_DECREF(v);
6444 return (PyObject *)u;
6445 }
6446 if (u == unicode_empty) {
6447 Py_DECREF(u);
6448 return (PyObject *)v;
6449 }
6450
6451 /* Concat the two Unicode strings */
6452 w = _PyUnicode_New(u->length + v->length);
6453 if (w == NULL)
6454 goto onError;
6455 Py_UNICODE_COPY(w->str, u->str, u->length);
6456 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6457
6458 Py_DECREF(u);
6459 Py_DECREF(v);
6460 return (PyObject *)w;
6461
6462onError:
6463 Py_XDECREF(u);
6464 Py_XDECREF(v);
6465 return NULL;
6466}
6467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006468PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469"S.count(sub[, start[, end]]) -> int\n\
6470\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006471Return the number of non-overlapping occurrences of substring sub in\n\
6472Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006473interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474
6475static PyObject *
6476unicode_count(PyUnicodeObject *self, PyObject *args)
6477{
6478 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006479 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006480 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 PyObject *result;
6482
Guido van Rossumb8872e62000-05-09 14:14:27 +00006483 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6484 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 return NULL;
6486
6487 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006488 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 if (substring == NULL)
6490 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006491
Fredrik Lundhc8162812006-05-26 19:33:03 +00006492 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006494 result = PyInt_FromSsize_t(
6495 stringlib_count(self->str + start, end - start,
6496 substring->str, substring->length)
6497 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498
6499 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006500
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 return result;
6502}
6503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006504PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006505"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006507Encodes S using the codec registered for encoding. encoding defaults\n\
6508to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006509handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006510a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6511'xmlcharrefreplace' as well as any other name registered with\n\
6512codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
6514static PyObject *
6515unicode_encode(PyUnicodeObject *self, PyObject *args)
6516{
6517 char *encoding = NULL;
6518 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006519 PyObject *v;
6520
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6522 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006523 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006524 if (v == NULL)
6525 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006526 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006527 PyErr_Format(PyExc_TypeError,
6528 "encoder did not return a string/unicode object "
6529 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006530 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006531 Py_DECREF(v);
6532 return NULL;
6533 }
6534 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006535
6536 onError:
6537 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006538}
6539
6540PyDoc_STRVAR(decode__doc__,
6541"S.decode([encoding[,errors]]) -> string or unicode\n\
6542\n\
6543Decodes S using the codec registered for encoding. encoding defaults\n\
6544to the default encoding. errors may be given to set a different error\n\
6545handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6546a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6547as well as any other name registerd with codecs.register_error that is\n\
6548able to handle UnicodeDecodeErrors.");
6549
6550static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006551unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006552{
6553 char *encoding = NULL;
6554 char *errors = NULL;
6555 PyObject *v;
6556
6557 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6558 return NULL;
6559 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006560 if (v == NULL)
6561 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006562 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006563 PyErr_Format(PyExc_TypeError,
6564 "decoder did not return a string/unicode object "
6565 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006566 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006567 Py_DECREF(v);
6568 return NULL;
6569 }
6570 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006571
6572 onError:
6573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574}
6575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006576PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577"S.expandtabs([tabsize]) -> unicode\n\
6578\n\
6579Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006580If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
6582static PyObject*
6583unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6584{
6585 Py_UNICODE *e;
6586 Py_UNICODE *p;
6587 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006588 Py_UNICODE *qe;
6589 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 PyUnicodeObject *u;
6591 int tabsize = 8;
6592
6593 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6594 return NULL;
6595
Thomas Wouters7e474022000-07-16 12:04:32 +00006596 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006597 i = 0; /* chars up to and including most recent \n or \r */
6598 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6599 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 for (p = self->str; p < e; p++)
6601 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006602 if (tabsize > 0) {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006603 incr = tabsize - (j % tabsize); /* cannot overflow */
6604 if (j > PY_SSIZE_T_MAX - incr)
6605 goto overflow1;
6606 j += incr;
6607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 }
6609 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006610 if (j > PY_SSIZE_T_MAX - 1)
6611 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 j++;
6613 if (*p == '\n' || *p == '\r') {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006614 if (i > PY_SSIZE_T_MAX - j)
6615 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006617 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 }
6619 }
6620
Guido van Rossum5bdff602008-03-11 21:18:06 +00006621 if (i > PY_SSIZE_T_MAX - j)
6622 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006623
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 /* Second pass: create output string and fill it */
6625 u = _PyUnicode_New(i + j);
6626 if (!u)
6627 return NULL;
6628
Guido van Rossum5bdff602008-03-11 21:18:06 +00006629 j = 0; /* same as in first pass */
6630 q = u->str; /* next output char */
6631 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632
6633 for (p = self->str; p < e; p++)
6634 if (*p == '\t') {
6635 if (tabsize > 0) {
6636 i = tabsize - (j % tabsize);
6637 j += i;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006638 while (i--) {
6639 if (q >= qe)
6640 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 }
6644 }
6645 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006646 if (q >= qe)
6647 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006649 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 if (*p == '\n' || *p == '\r')
6651 j = 0;
6652 }
6653
6654 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006655
6656 overflow2:
6657 Py_DECREF(u);
6658 overflow1:
6659 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661}
6662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664"S.find(sub [,start [,end]]) -> int\n\
6665\n\
6666Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006667such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668arguments start and end are interpreted as in slice notation.\n\
6669\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006670Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672static PyObject *
6673unicode_find(PyUnicodeObject *self, PyObject *args)
6674{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006675 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006676 Py_ssize_t start;
6677 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006678 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679
Facundo Batista57d56692007-11-16 18:04:14 +00006680 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006683 result = stringlib_find_slice(
6684 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6685 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6686 start, end
6687 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
6689 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006690
6691 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692}
6693
6694static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006695unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696{
6697 if (index < 0 || index >= self->length) {
6698 PyErr_SetString(PyExc_IndexError, "string index out of range");
6699 return NULL;
6700 }
6701
6702 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6703}
6704
6705static long
6706unicode_hash(PyUnicodeObject *self)
6707{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006708 /* Since Unicode objects compare equal to their ASCII string
6709 counterparts, they should use the individual character values
6710 as basis for their hash value. This is needed to assure that
6711 strings and Unicode objects behave in the same way as
6712 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
Martin v. Löwis18e16552006-02-15 17:27:45 +00006714 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006715 register Py_UNICODE *p;
6716 register long x;
6717
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 if (self->hash != -1)
6719 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006720 len = PyUnicode_GET_SIZE(self);
6721 p = PyUnicode_AS_UNICODE(self);
6722 x = *p << 7;
6723 while (--len >= 0)
6724 x = (1000003*x) ^ *p++;
6725 x ^= PyUnicode_GET_SIZE(self);
6726 if (x == -1)
6727 x = -2;
6728 self->hash = x;
6729 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730}
6731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006732PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733"S.index(sub [,start [,end]]) -> int\n\
6734\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006735Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736
6737static PyObject *
6738unicode_index(PyUnicodeObject *self, PyObject *args)
6739{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006740 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006741 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006742 Py_ssize_t start;
6743 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744
Facundo Batista57d56692007-11-16 18:04:14 +00006745 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006748 result = stringlib_find_slice(
6749 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6750 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6751 start, end
6752 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
6754 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006755
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 if (result < 0) {
6757 PyErr_SetString(PyExc_ValueError, "substring not found");
6758 return NULL;
6759 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006760
Martin v. Löwis18e16552006-02-15 17:27:45 +00006761 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762}
6763
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006764PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006765"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006768at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769
6770static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006771unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
6773 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6774 register const Py_UNICODE *e;
6775 int cased;
6776
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 /* Shortcut for single character strings */
6778 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006779 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006781 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006782 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006783 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006784
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 e = p + PyUnicode_GET_SIZE(self);
6786 cased = 0;
6787 for (; p < e; p++) {
6788 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006789
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006791 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 else if (!cased && Py_UNICODE_ISLOWER(ch))
6793 cased = 1;
6794 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006795 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796}
6797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006798PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006799"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006801Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006802at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
6804static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006805unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806{
6807 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6808 register const Py_UNICODE *e;
6809 int cased;
6810
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 /* Shortcut for single character strings */
6812 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006813 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006815 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006816 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006817 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006818
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 e = p + PyUnicode_GET_SIZE(self);
6820 cased = 0;
6821 for (; p < e; p++) {
6822 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006823
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 else if (!cased && Py_UNICODE_ISUPPER(ch))
6827 cased = 1;
6828 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830}
6831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006832PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006833"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006835Return True if S is a titlecased string and there is at least one\n\
6836character in S, i.e. upper- and titlecase characters may only\n\
6837follow uncased characters and lowercase characters only cased ones.\n\
6838Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839
6840static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006841unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842{
6843 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6844 register const Py_UNICODE *e;
6845 int cased, previous_is_cased;
6846
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 /* Shortcut for single character strings */
6848 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006849 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6850 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006852 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006853 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006855
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 e = p + PyUnicode_GET_SIZE(self);
6857 cased = 0;
6858 previous_is_cased = 0;
6859 for (; p < e; p++) {
6860 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006861
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6863 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006864 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 previous_is_cased = 1;
6866 cased = 1;
6867 }
6868 else if (Py_UNICODE_ISLOWER(ch)) {
6869 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 previous_is_cased = 1;
6872 cased = 1;
6873 }
6874 else
6875 previous_is_cased = 0;
6876 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006877 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878}
6879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006880PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006881"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006883Return True if all characters in S are whitespace\n\
6884and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
6886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006887unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888{
6889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6890 register const Py_UNICODE *e;
6891
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 /* Shortcut for single character strings */
6893 if (PyUnicode_GET_SIZE(self) == 1 &&
6894 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006895 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006897 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006898 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006899 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006900
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 e = p + PyUnicode_GET_SIZE(self);
6902 for (; p < e; p++) {
6903 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006904 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006906 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907}
6908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006909PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006910"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006911\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006912Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006913and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006914
6915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006916unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006917{
6918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6919 register const Py_UNICODE *e;
6920
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006921 /* Shortcut for single character strings */
6922 if (PyUnicode_GET_SIZE(self) == 1 &&
6923 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006924 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006925
6926 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006927 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006929
6930 e = p + PyUnicode_GET_SIZE(self);
6931 for (; p < e; p++) {
6932 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006933 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006934 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006935 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006936}
6937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006939"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006940\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006941Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006942and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006943
6944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006945unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006946{
6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948 register const Py_UNICODE *e;
6949
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006950 /* Shortcut for single character strings */
6951 if (PyUnicode_GET_SIZE(self) == 1 &&
6952 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006953 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006954
6955 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006956 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006957 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006958
6959 e = p + PyUnicode_GET_SIZE(self);
6960 for (; p < e; p++) {
6961 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006962 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006964 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006965}
6966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006967PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006968"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006970Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006971False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972
6973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006974unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975{
6976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6977 register const Py_UNICODE *e;
6978
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 /* Shortcut for single character strings */
6980 if (PyUnicode_GET_SIZE(self) == 1 &&
6981 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006982 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006984 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006985 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006986 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006987
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988 e = p + PyUnicode_GET_SIZE(self);
6989 for (; p < e; p++) {
6990 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006991 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006993 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994}
6995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006996PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006997"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006999Return True if all characters in S are digits\n\
7000and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001
7002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007003unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004{
7005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7006 register const Py_UNICODE *e;
7007
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 /* Shortcut for single character strings */
7009 if (PyUnicode_GET_SIZE(self) == 1 &&
7010 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007011 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007013 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007014 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007015 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007016
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 e = p + PyUnicode_GET_SIZE(self);
7018 for (; p < e; p++) {
7019 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007020 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007022 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023}
7024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007025PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007026"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007028Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030
7031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007032unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033{
7034 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7035 register const Py_UNICODE *e;
7036
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 /* Shortcut for single character strings */
7038 if (PyUnicode_GET_SIZE(self) == 1 &&
7039 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007040 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007042 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007043 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007044 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007045
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 e = p + PyUnicode_GET_SIZE(self);
7047 for (; p < e; p++) {
7048 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007049 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007051 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052}
7053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007054PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055"S.join(sequence) -> unicode\n\
7056\n\
7057Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007058sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059
7060static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007061unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007063 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Martin v. Löwis18e16552006-02-15 17:27:45 +00007066static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067unicode_length(PyUnicodeObject *self)
7068{
7069 return self->length;
7070}
7071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007072PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007073"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007075Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007076done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077
7078static PyObject *
7079unicode_ljust(PyUnicodeObject *self, PyObject *args)
7080{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007081 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007082 Py_UNICODE fillchar = ' ';
7083
Martin v. Löwis412fb672006-04-13 06:34:32 +00007084 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085 return NULL;
7086
Tim Peters7a29bd52001-09-12 03:03:31 +00007087 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 Py_INCREF(self);
7089 return (PyObject*) self;
7090 }
7091
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007092 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093}
7094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007095PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096"S.lower() -> unicode\n\
7097\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007098Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099
7100static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007101unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 return fixup(self, fixlower);
7104}
7105
/* Selector values telling the strip helpers which end(s) to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selector values above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for selector i: skips the three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7114
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007115/* externally visible for str.strip(unicode) */
7116PyObject *
7117_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7118{
7119 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007120 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007121 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007122 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7123 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007124
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007125 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7126
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007127 i = 0;
7128 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007129 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7130 i++;
7131 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007132 }
7133
7134 j = len;
7135 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007136 do {
7137 j--;
7138 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7139 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007140 }
7141
7142 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007143 Py_INCREF(self);
7144 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007145 }
7146 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007147 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007148}
7149
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150
7151static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007152do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007154 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007155 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007156
7157 i = 0;
7158 if (striptype != RIGHTSTRIP) {
7159 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7160 i++;
7161 }
7162 }
7163
7164 j = len;
7165 if (striptype != LEFTSTRIP) {
7166 do {
7167 j--;
7168 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7169 j++;
7170 }
7171
7172 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7173 Py_INCREF(self);
7174 return (PyObject*)self;
7175 }
7176 else
7177 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178}
7179
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007180
7181static PyObject *
7182do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7183{
7184 PyObject *sep = NULL;
7185
7186 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7187 return NULL;
7188
7189 if (sep != NULL && sep != Py_None) {
7190 if (PyUnicode_Check(sep))
7191 return _PyUnicode_XStrip(self, striptype, sep);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007192 else if (PyString_Check(sep)) {
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007193 PyObject *res;
7194 sep = PyUnicode_FromObject(sep);
7195 if (sep==NULL)
7196 return NULL;
7197 res = _PyUnicode_XStrip(self, striptype, sep);
7198 Py_DECREF(sep);
7199 return res;
7200 }
7201 else {
7202 PyErr_Format(PyExc_TypeError,
7203 "%s arg must be None, unicode or str",
7204 STRIPNAME(striptype));
7205 return NULL;
7206 }
7207 }
7208
7209 return do_strip(self, striptype);
7210}
7211
7212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007213PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007214"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007215\n\
7216Return a copy of the string S with leading and trailing\n\
7217whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007218If chars is given and not None, remove characters in chars instead.\n\
7219If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007220
7221static PyObject *
7222unicode_strip(PyUnicodeObject *self, PyObject *args)
7223{
7224 if (PyTuple_GET_SIZE(args) == 0)
7225 return do_strip(self, BOTHSTRIP); /* Common case */
7226 else
7227 return do_argstrip(self, BOTHSTRIP, args);
7228}
7229
7230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007231PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007232"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007233\n\
7234Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007235If chars is given and not None, remove characters in chars instead.\n\
7236If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007237
7238static PyObject *
7239unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7240{
7241 if (PyTuple_GET_SIZE(args) == 0)
7242 return do_strip(self, LEFTSTRIP); /* Common case */
7243 else
7244 return do_argstrip(self, LEFTSTRIP, args);
7245}
7246
7247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007248PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007249"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007250\n\
7251Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007252If chars is given and not None, remove characters in chars instead.\n\
7253If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007254
7255static PyObject *
7256unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7257{
7258 if (PyTuple_GET_SIZE(args) == 0)
7259 return do_strip(self, RIGHTSTRIP); /* Common case */
7260 else
7261 return do_argstrip(self, RIGHTSTRIP, args);
7262}
7263
7264
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007266unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267{
7268 PyUnicodeObject *u;
7269 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007270 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007271 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272
7273 if (len < 0)
7274 len = 0;
7275
Tim Peters7a29bd52001-09-12 03:03:31 +00007276 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 /* no repeat, return original string */
7278 Py_INCREF(str);
7279 return (PyObject*) str;
7280 }
Tim Peters8f422462000-09-09 06:13:41 +00007281
7282 /* ensure # of chars needed doesn't overflow int and # of bytes
7283 * needed doesn't overflow size_t
7284 */
7285 nchars = len * str->length;
7286 if (len && nchars / len != str->length) {
7287 PyErr_SetString(PyExc_OverflowError,
7288 "repeated string is too long");
7289 return NULL;
7290 }
7291 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7292 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7293 PyErr_SetString(PyExc_OverflowError,
7294 "repeated string is too long");
7295 return NULL;
7296 }
7297 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298 if (!u)
7299 return NULL;
7300
7301 p = u->str;
7302
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007303 if (str->length == 1 && len > 0) {
7304 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007305 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007306 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007307 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007308 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007309 done = str->length;
7310 }
7311 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007312 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007313 Py_UNICODE_COPY(p+done, p, n);
7314 done += n;
7315 }
7316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317
7318 return (PyObject*) u;
7319}
7320
7321PyObject *PyUnicode_Replace(PyObject *obj,
7322 PyObject *subobj,
7323 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007324 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325{
7326 PyObject *self;
7327 PyObject *str1;
7328 PyObject *str2;
7329 PyObject *result;
7330
7331 self = PyUnicode_FromObject(obj);
7332 if (self == NULL)
7333 return NULL;
7334 str1 = PyUnicode_FromObject(subobj);
7335 if (str1 == NULL) {
7336 Py_DECREF(self);
7337 return NULL;
7338 }
7339 str2 = PyUnicode_FromObject(replobj);
7340 if (str2 == NULL) {
7341 Py_DECREF(self);
7342 Py_DECREF(str1);
7343 return NULL;
7344 }
Tim Petersced69f82003-09-16 20:30:58 +00007345 result = replace((PyUnicodeObject *)self,
7346 (PyUnicodeObject *)str1,
7347 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 maxcount);
7349 Py_DECREF(self);
7350 Py_DECREF(str1);
7351 Py_DECREF(str2);
7352 return result;
7353}
7354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007355PyDoc_STRVAR(replace__doc__,
Georg Brandl30fadc12008-05-30 07:54:16 +00007356"S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357\n\
7358Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007359old replaced by new. If the optional argument count is\n\
7360given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361
7362static PyObject*
7363unicode_replace(PyUnicodeObject *self, PyObject *args)
7364{
7365 PyUnicodeObject *str1;
7366 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007367 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 PyObject *result;
7369
Martin v. Löwis18e16552006-02-15 17:27:45 +00007370 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371 return NULL;
7372 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7373 if (str1 == NULL)
7374 return NULL;
7375 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007376 if (str2 == NULL) {
7377 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380
7381 result = replace(self, str1, str2, maxcount);
7382
7383 Py_DECREF(str1);
7384 Py_DECREF(str2);
7385 return result;
7386}
7387
7388static
7389PyObject *unicode_repr(PyObject *unicode)
7390{
7391 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7392 PyUnicode_GET_SIZE(unicode),
7393 1);
7394}
7395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007396PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397"S.rfind(sub [,start [,end]]) -> int\n\
7398\n\
7399Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007400such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401arguments start and end are interpreted as in slice notation.\n\
7402\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007403Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404
7405static PyObject *
7406unicode_rfind(PyUnicodeObject *self, PyObject *args)
7407{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007408 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007409 Py_ssize_t start;
7410 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007411 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
Facundo Batista57d56692007-11-16 18:04:14 +00007413 if (!_ParseTupleFinds(args, &substring, &start, &end))
7414 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007416 result = stringlib_rfind_slice(
7417 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7418 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7419 start, end
7420 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421
7422 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007423
7424 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425}
7426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007427PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428"S.rindex(sub [,start [,end]]) -> int\n\
7429\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007430Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431
7432static PyObject *
7433unicode_rindex(PyUnicodeObject *self, PyObject *args)
7434{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007435 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007436 Py_ssize_t start;
7437 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007438 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439
Facundo Batista57d56692007-11-16 18:04:14 +00007440 if (!_ParseTupleFinds(args, &substring, &start, &end))
7441 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007443 result = stringlib_rfind_slice(
7444 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7445 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7446 start, end
7447 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
7449 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007450
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 if (result < 0) {
7452 PyErr_SetString(PyExc_ValueError, "substring not found");
7453 return NULL;
7454 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007455 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456}
7457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007458PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007459"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007461Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007462done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463
7464static PyObject *
7465unicode_rjust(PyUnicodeObject *self, PyObject *args)
7466{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007467 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007468 Py_UNICODE fillchar = ' ';
7469
Martin v. Löwis412fb672006-04-13 06:34:32 +00007470 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 return NULL;
7472
Tim Peters7a29bd52001-09-12 03:03:31 +00007473 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 Py_INCREF(self);
7475 return (PyObject*) self;
7476 }
7477
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007478 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479}
7480
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007482unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483{
7484 /* standard clamping */
7485 if (start < 0)
7486 start = 0;
7487 if (end < 0)
7488 end = 0;
7489 if (end > self->length)
7490 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007491 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 /* full slice, return original string */
7493 Py_INCREF(self);
7494 return (PyObject*) self;
7495 }
7496 if (start > end)
7497 start = end;
7498 /* copy slice */
7499 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7500 end - start);
7501}
7502
7503PyObject *PyUnicode_Split(PyObject *s,
7504 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506{
7507 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007508
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 s = PyUnicode_FromObject(s);
7510 if (s == NULL)
7511 return NULL;
7512 if (sep != NULL) {
7513 sep = PyUnicode_FromObject(sep);
7514 if (sep == NULL) {
7515 Py_DECREF(s);
7516 return NULL;
7517 }
7518 }
7519
7520 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7521
7522 Py_DECREF(s);
7523 Py_XDECREF(sep);
7524 return result;
7525}
7526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007527PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528"S.split([sep [,maxsplit]]) -> list of strings\n\
7529\n\
7530Return a list of the words in S, using sep as the\n\
7531delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007532splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007533whitespace string is a separator and empty strings are\n\
7534removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535
7536static PyObject*
7537unicode_split(PyUnicodeObject *self, PyObject *args)
7538{
7539 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007540 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541
Martin v. Löwis18e16552006-02-15 17:27:45 +00007542 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 return NULL;
7544
7545 if (substring == Py_None)
7546 return split(self, NULL, maxcount);
7547 else if (PyUnicode_Check(substring))
7548 return split(self, (PyUnicodeObject *)substring, maxcount);
7549 else
7550 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7551}
7552
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007553PyObject *
7554PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7555{
7556 PyObject* str_obj;
7557 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007558 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007559
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007560 str_obj = PyUnicode_FromObject(str_in);
7561 if (!str_obj)
7562 return NULL;
7563 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007564 if (!sep_obj) {
7565 Py_DECREF(str_obj);
7566 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007567 }
7568
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007569 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007570 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7571 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7572 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007573
Fredrik Lundhb9479482006-05-26 17:22:38 +00007574 Py_DECREF(sep_obj);
7575 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007576
7577 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007578}
7579
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007580
7581PyObject *
7582PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7583{
7584 PyObject* str_obj;
7585 PyObject* sep_obj;
7586 PyObject* out;
7587
7588 str_obj = PyUnicode_FromObject(str_in);
7589 if (!str_obj)
7590 return NULL;
7591 sep_obj = PyUnicode_FromObject(sep_in);
7592 if (!sep_obj) {
7593 Py_DECREF(str_obj);
7594 return NULL;
7595 }
7596
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007597 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007598 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7599 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7600 );
7601
7602 Py_DECREF(sep_obj);
7603 Py_DECREF(str_obj);
7604
7605 return out;
7606}
7607
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007608PyDoc_STRVAR(partition__doc__,
7609"S.partition(sep) -> (head, sep, tail)\n\
7610\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007611Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007612the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007613found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007614
7615static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007616unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007617{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007618 return PyUnicode_Partition((PyObject *)self, separator);
7619}
7620
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007621PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007622"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007623\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007624Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007625the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007626separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007627
7628static PyObject*
7629unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7630{
7631 return PyUnicode_RPartition((PyObject *)self, separator);
7632}
7633
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007634PyObject *PyUnicode_RSplit(PyObject *s,
7635 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007636 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007637{
7638 PyObject *result;
7639
7640 s = PyUnicode_FromObject(s);
7641 if (s == NULL)
7642 return NULL;
7643 if (sep != NULL) {
7644 sep = PyUnicode_FromObject(sep);
7645 if (sep == NULL) {
7646 Py_DECREF(s);
7647 return NULL;
7648 }
7649 }
7650
7651 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7652
7653 Py_DECREF(s);
7654 Py_XDECREF(sep);
7655 return result;
7656}
7657
7658PyDoc_STRVAR(rsplit__doc__,
7659"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7660\n\
7661Return a list of the words in S, using sep as the\n\
7662delimiter string, starting at the end of the string and\n\
7663working to the front. If maxsplit is given, at most maxsplit\n\
7664splits are done. If sep is not specified, any whitespace string\n\
7665is a separator.");
7666
7667static PyObject*
7668unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7669{
7670 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007671 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007672
Martin v. Löwis18e16552006-02-15 17:27:45 +00007673 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007674 return NULL;
7675
7676 if (substring == Py_None)
7677 return rsplit(self, NULL, maxcount);
7678 else if (PyUnicode_Check(substring))
7679 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7680 else
7681 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7682}
7683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007684PyDoc_STRVAR(splitlines__doc__,
Amaury Forgeot d'Arc2a1fd052008-11-29 02:03:32 +00007685"S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686\n\
7687Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007688Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007689is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690
7691static PyObject*
7692unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7693{
Guido van Rossum86662912000-04-11 15:38:46 +00007694 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695
Guido van Rossum86662912000-04-11 15:38:46 +00007696 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697 return NULL;
7698
Guido van Rossum86662912000-04-11 15:38:46 +00007699 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700}
7701
7702static
7703PyObject *unicode_str(PyUnicodeObject *self)
7704{
Fred Drakee4315f52000-05-09 19:53:39 +00007705 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706}
7707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007708PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709"S.swapcase() -> unicode\n\
7710\n\
7711Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007712and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713
7714static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007715unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 return fixup(self, fixswapcase);
7718}
7719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007720PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721"S.translate(table) -> unicode\n\
7722\n\
7723Return a copy of the string S, where all characters have been mapped\n\
7724through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007725Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7726Unmapped characters are left untouched. Characters mapped to None\n\
7727are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728
7729static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007730unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731{
Tim Petersced69f82003-09-16 20:30:58 +00007732 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007734 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 "ignore");
7736}
7737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007738PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739"S.upper() -> unicode\n\
7740\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742
7743static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007744unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 return fixup(self, fixupper);
7747}
7748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007749PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750"S.zfill(width) -> unicode\n\
7751\n\
Georg Brandl98064072008-09-09 19:26:00 +00007752Pad a numeric string S with zeros on the left, to fill a field\n\
7753of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754
7755static PyObject *
7756unicode_zfill(PyUnicodeObject *self, PyObject *args)
7757{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007758 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 PyUnicodeObject *u;
7760
Martin v. Löwis18e16552006-02-15 17:27:45 +00007761 Py_ssize_t width;
7762 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 return NULL;
7764
7765 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007766 if (PyUnicode_CheckExact(self)) {
7767 Py_INCREF(self);
7768 return (PyObject*) self;
7769 }
7770 else
7771 return PyUnicode_FromUnicode(
7772 PyUnicode_AS_UNICODE(self),
7773 PyUnicode_GET_SIZE(self)
7774 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 }
7776
7777 fill = width - self->length;
7778
7779 u = pad(self, fill, 0, '0');
7780
Walter Dörwald068325e2002-04-15 13:36:47 +00007781 if (u == NULL)
7782 return NULL;
7783
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 if (u->str[fill] == '+' || u->str[fill] == '-') {
7785 /* move sign to beginning of string */
7786 u->str[0] = u->str[fill];
7787 u->str[fill] = '0';
7788 }
7789
7790 return (PyObject*) u;
7791}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792
#if 0
/* Debugging aid (compiled out): returns the value of the file-level
   numfree counter as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007801PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007802"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007804Return True if S starts with the specified prefix, False otherwise.\n\
7805With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007806With optional end, stop comparing S at that position.\n\
7807prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808
7809static PyObject *
7810unicode_startswith(PyUnicodeObject *self,
7811 PyObject *args)
7812{
Georg Brandl24250812006-06-09 18:45:48 +00007813 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007815 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007816 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007817 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818
Georg Brandl24250812006-06-09 18:45:48 +00007819 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007820 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007822 if (PyTuple_Check(subobj)) {
7823 Py_ssize_t i;
7824 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7825 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7826 PyTuple_GET_ITEM(subobj, i));
7827 if (substring == NULL)
7828 return NULL;
7829 result = tailmatch(self, substring, start, end, -1);
7830 Py_DECREF(substring);
7831 if (result) {
7832 Py_RETURN_TRUE;
7833 }
7834 }
7835 /* nothing matched */
7836 Py_RETURN_FALSE;
7837 }
7838 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007840 return NULL;
7841 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007843 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844}
7845
7846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007847PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007848"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007850Return True if S ends with the specified suffix, False otherwise.\n\
7851With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007852With optional end, stop comparing S at that position.\n\
7853suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854
7855static PyObject *
7856unicode_endswith(PyUnicodeObject *self,
7857 PyObject *args)
7858{
Georg Brandl24250812006-06-09 18:45:48 +00007859 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007861 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007862 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007863 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864
Georg Brandl24250812006-06-09 18:45:48 +00007865 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7866 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007868 if (PyTuple_Check(subobj)) {
7869 Py_ssize_t i;
7870 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7871 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7872 PyTuple_GET_ITEM(subobj, i));
7873 if (substring == NULL)
7874 return NULL;
7875 result = tailmatch(self, substring, start, end, +1);
7876 Py_DECREF(substring);
7877 if (result) {
7878 Py_RETURN_TRUE;
7879 }
7880 }
7881 Py_RETURN_FALSE;
7882 }
7883 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007885 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886
Georg Brandl24250812006-06-09 18:45:48 +00007887 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007889 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890}
7891
7892
Eric Smitha9f7d622008-02-17 19:46:49 +00007893/* Implements do_string_format, which is unicode because of stringlib */
7894#include "stringlib/string_format.h"
7895
7896PyDoc_STRVAR(format__doc__,
7897"S.format(*args, **kwargs) -> unicode\n\
7898\n\
7899");
7900
Eric Smithdc13b792008-05-30 18:10:04 +00007901static PyObject *
7902unicode__format__(PyObject *self, PyObject *args)
7903{
7904 PyObject *format_spec;
7905 PyObject *result = NULL;
7906 PyObject *tmp = NULL;
7907
7908 /* If 2.x, convert format_spec to the same type as value */
7909 /* This is to allow things like u''.format('') */
7910 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7911 goto done;
7912 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7913 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7914 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7915 goto done;
7916 }
7917 tmp = PyObject_Unicode(format_spec);
7918 if (tmp == NULL)
7919 goto done;
7920 format_spec = tmp;
7921
7922 result = _PyUnicode_FormatAdvanced(self,
7923 PyUnicode_AS_UNICODE(format_spec),
7924 PyUnicode_GET_SIZE(format_spec));
7925done:
7926 Py_XDECREF(tmp);
7927 return result;
7928}
7929
Eric Smitha9f7d622008-02-17 19:46:49 +00007930PyDoc_STRVAR(p_format__doc__,
7931"S.__format__(format_spec) -> unicode\n\
7932\n\
7933");
7934
Robert Schuppenies901c9972008-06-10 10:10:31 +00007935static PyObject *
7936unicode__sizeof__(PyUnicodeObject *v)
7937{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007938 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7939 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007940}
7941
7942PyDoc_STRVAR(sizeof__doc__,
7943"S.__sizeof__() -> size of S in memory, in bytes\n\
7944\n\
7945");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007946
7947static PyObject *
7948unicode_getnewargs(PyUnicodeObject *v)
7949{
7950 return Py_BuildValue("(u#)", v->str, v->length);
7951}
7952
7953
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954static PyMethodDef unicode_methods[] = {
7955
7956 /* Order is according to common usage: often used methods should
7957 appear first, since lookup is done sequentially. */
7958
Georg Brandlecdc0a92006-03-30 12:19:07 +00007959 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007960 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7961 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007962 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007963 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7964 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7965 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7966 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7967 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7968 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7969 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007970 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007971 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7972 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7973 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007974 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007975 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007976/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7977 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7978 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7979 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007980 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007981 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007982 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007983 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007984 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7985 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7986 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7987 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7988 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7989 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7990 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7991 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7992 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7993 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7994 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7995 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7996 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7997 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007998 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007999 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8000 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8001 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8002 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00008003 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008004#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008005 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006#endif
8007
8008#if 0
8009 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008010 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011#endif
8012
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008013 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 {NULL, NULL}
8015};
8016
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008017static PyObject *
8018unicode_mod(PyObject *v, PyObject *w)
8019{
8020 if (!PyUnicode_Check(v)) {
8021 Py_INCREF(Py_NotImplemented);
8022 return Py_NotImplemented;
8023 }
8024 return PyUnicode_Format(v, w);
8025}
8026
8027static PyNumberMethods unicode_as_number = {
8028 0, /*nb_add*/
8029 0, /*nb_subtract*/
8030 0, /*nb_multiply*/
8031 0, /*nb_divide*/
8032 unicode_mod, /*nb_remainder*/
8033};
8034
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008036 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00008037 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008038 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8039 (ssizeargfunc) unicode_getitem, /* sq_item */
8040 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 0, /* sq_ass_item */
8042 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00008043 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044};
8045
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008046static PyObject*
8047unicode_subscript(PyUnicodeObject* self, PyObject* item)
8048{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008049 if (PyIndex_Check(item)) {
8050 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008051 if (i == -1 && PyErr_Occurred())
8052 return NULL;
8053 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008054 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008055 return unicode_getitem(self, i);
8056 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008057 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008058 Py_UNICODE* source_buf;
8059 Py_UNICODE* result_buf;
8060 PyObject* result;
8061
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008062 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008063 &start, &stop, &step, &slicelength) < 0) {
8064 return NULL;
8065 }
8066
8067 if (slicelength <= 0) {
8068 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008069 } else if (start == 0 && step == 1 && slicelength == self->length &&
8070 PyUnicode_CheckExact(self)) {
8071 Py_INCREF(self);
8072 return (PyObject *)self;
8073 } else if (step == 1) {
8074 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008075 } else {
8076 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008077 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8078 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008079
8080 if (result_buf == NULL)
8081 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008082
8083 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8084 result_buf[i] = source_buf[cur];
8085 }
Tim Petersced69f82003-09-16 20:30:58 +00008086
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008087 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008088 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008089 return result;
8090 }
8091 } else {
8092 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8093 return NULL;
8094 }
8095}
8096
8097static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008098 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008099 (binaryfunc)unicode_subscript, /* mp_subscript */
8100 (objobjargproc)0, /* mp_ass_subscript */
8101};
8102
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008105 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 const void **ptr)
8107{
8108 if (index != 0) {
8109 PyErr_SetString(PyExc_SystemError,
8110 "accessing non-existent unicode segment");
8111 return -1;
8112 }
8113 *ptr = (void *) self->str;
8114 return PyUnicode_GET_DATA_SIZE(self);
8115}
8116
Martin v. Löwis18e16552006-02-15 17:27:45 +00008117static Py_ssize_t
8118unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 const void **ptr)
8120{
8121 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00008122 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 return -1;
8124}
8125
8126static int
8127unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129{
8130 if (lenp)
8131 *lenp = PyUnicode_GET_DATA_SIZE(self);
8132 return 1;
8133}
8134
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008135static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008137 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 const void **ptr)
8139{
8140 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008141
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 if (index != 0) {
8143 PyErr_SetString(PyExc_SystemError,
8144 "accessing non-existent unicode segment");
8145 return -1;
8146 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008147 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 if (str == NULL)
8149 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008150 *ptr = (void *) PyString_AS_STRING(str);
8151 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152}
8153
8154/* Helpers for PyUnicode_Format() */
8155
8156static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008157getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008159 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 if (argidx < arglen) {
8161 (*p_argidx)++;
8162 if (arglen < 0)
8163 return args;
8164 else
8165 return PyTuple_GetItem(args, argidx);
8166 }
8167 PyErr_SetString(PyExc_TypeError,
8168 "not enough arguments for format string");
8169 return NULL;
8170}
8171
/* Bit flags carried in the `flags` argument of the format helpers; each
   one occupies its own bit so they can be OR-ed together. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08    /* tested by formatfloat()/formatint() to emit '#' */
#define F_ZERO  0x10
8177
Martin v. Löwis18e16552006-02-15 17:27:45 +00008178static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008179strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008181 register Py_ssize_t i;
8182 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183 for (i = len - 1; i >= 0; i--)
8184 buffer[i] = (Py_UNICODE) charbuffer[i];
8185
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186 return len;
8187}
8188
Neal Norwitzfc76d632006-01-10 06:03:13 +00008189static int
8190doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8191{
Tim Peters15231542006-02-16 01:08:01 +00008192 Py_ssize_t result;
8193
Neal Norwitzfc76d632006-01-10 06:03:13 +00008194 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008195 result = strtounicode(buffer, (char *)buffer);
8196 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008197}
8198
8199static int
8200longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8201{
Tim Peters15231542006-02-16 01:08:01 +00008202 Py_ssize_t result;
8203
Neal Norwitzfc76d632006-01-10 06:03:13 +00008204 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008205 result = strtounicode(buffer, (char *)buffer);
8206 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008207}
8208
Guido van Rossum078151d2002-08-11 04:24:12 +00008209/* XXX To save some code duplication, formatfloat/long/int could have been
8210 shared with stringobject.c, converting from 8-bit to Unicode after the
8211 formatting is done. */
8212
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213static int
8214formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008215 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 int flags,
8217 int prec,
8218 int type,
8219 PyObject *v)
8220{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008221 /* fmt = '%#.' + `prec` + `type`
8222 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 char fmt[20];
8224 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008225
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 x = PyFloat_AsDouble(v);
8227 if (x == -1.0 && PyErr_Occurred())
8228 return -1;
8229 if (prec < 0)
8230 prec = 6;
Eric Smithd6c393a2008-07-17 19:49:47 +00008231 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8232 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008233 /* Worst case length calc to ensure no buffer overrun:
8234
8235 'g' formats:
8236 fmt = %#.<prec>g
8237 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8238 for any double rep.)
8239 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8240
8241 'f' formats:
8242 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8243 len = 1 + 50 + 1 + prec = 52 + prec
8244
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008245 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008246 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008247
8248 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008249 if (((type == 'g' || type == 'G') &&
8250 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smithd6c393a2008-07-17 19:49:47 +00008251 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008252 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008253 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008254 return -1;
8255 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008256 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8257 (flags&F_ALT) ? "#" : "",
8258 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008259 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260}
8261
Tim Peters38fd5b62000-09-21 05:43:11 +00008262static PyObject*
8263formatlong(PyObject *val, int flags, int prec, int type)
8264{
8265 char *buf;
8266 int i, len;
8267 PyObject *str; /* temporary string object. */
8268 PyUnicodeObject *result;
8269
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008270 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008271 if (!str)
8272 return NULL;
8273 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008274 if (!result) {
8275 Py_DECREF(str);
8276 return NULL;
8277 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008278 for (i = 0; i < len; i++)
8279 result->str[i] = buf[i];
8280 result->str[len] = 0;
8281 Py_DECREF(str);
8282 return (PyObject*)result;
8283}
8284
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285static int
8286formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008287 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 int flags,
8289 int prec,
8290 int type,
8291 PyObject *v)
8292{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008293 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008294 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8295 * + 1 + 1
8296 * = 24
8297 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008298 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008299 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 long x;
8301
8302 x = PyInt_AsLong(v);
8303 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008304 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008305 if (x < 0 && type == 'u') {
8306 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008307 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008308 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8309 sign = "-";
8310 else
8311 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008313 prec = 1;
8314
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008315 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8316 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008317 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008318 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008319 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008320 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008321 return -1;
8322 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008323
8324 if ((flags & F_ALT) &&
8325 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008326 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008327 * of issues that cause pain:
8328 * - when 0 is being converted, the C standard leaves off
8329 * the '0x' or '0X', which is inconsistent with other
8330 * %#x/%#X conversions and inconsistent with Python's
8331 * hex() function
8332 * - there are platforms that violate the standard and
8333 * convert 0 with the '0x' or '0X'
8334 * (Metrowerks, Compaq Tru64)
8335 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008336 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008337 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008338 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008339 * We can achieve the desired consistency by inserting our
8340 * own '0x' or '0X' prefix, and substituting %x/%X in place
8341 * of %#x/%#X.
8342 *
8343 * Note that this is the same approach as used in
8344 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008345 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008346 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8347 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008348 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008349 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008350 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8351 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008352 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008353 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008354 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008355 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008356 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008357 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358}
8359
8360static int
8361formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008362 size_t buflen,
8363 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008365 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008366 if (PyUnicode_Check(v)) {
8367 if (PyUnicode_GET_SIZE(v) != 1)
8368 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008372 else if (PyString_Check(v)) {
8373 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008374 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008375 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377
8378 else {
8379 /* Integer input truncated to a character */
8380 long x;
8381 x = PyInt_AsLong(v);
8382 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008383 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008384#ifdef Py_UNICODE_WIDE
8385 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008386 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008387 "%c arg not in range(0x110000) "
8388 "(wide Python build)");
8389 return -1;
8390 }
8391#else
8392 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008393 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008394 "%c arg not in range(0x10000) "
8395 "(narrow Python build)");
8396 return -1;
8397 }
8398#endif
8399 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 }
8401 buf[1] = '\0';
8402 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008403
8404 onError:
8405 PyErr_SetString(PyExc_TypeError,
8406 "%c requires int or char");
8407 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408}
8409
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in characters) of the scratch buffer in which
   floats, ints and chars are formatted.  XXX This is a magic number.  Each
   formatting routine does bounds checking to ensure no overflow, but a
   better solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8419
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420PyObject *PyUnicode_Format(PyObject *format,
8421 PyObject *args)
8422{
8423 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008424 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 int args_owned = 0;
8426 PyUnicodeObject *result = NULL;
8427 PyObject *dict = NULL;
8428 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008429
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 if (format == NULL || args == NULL) {
8431 PyErr_BadInternalCall();
8432 return NULL;
8433 }
8434 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008435 if (uformat == NULL)
8436 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 fmt = PyUnicode_AS_UNICODE(uformat);
8438 fmtcnt = PyUnicode_GET_SIZE(uformat);
8439
8440 reslen = rescnt = fmtcnt + 100;
8441 result = _PyUnicode_New(reslen);
8442 if (result == NULL)
8443 goto onError;
8444 res = PyUnicode_AS_UNICODE(result);
8445
8446 if (PyTuple_Check(args)) {
8447 arglen = PyTuple_Size(args);
8448 argidx = 0;
8449 }
8450 else {
8451 arglen = -1;
8452 argidx = -2;
8453 }
Christian Heimese93237d2007-12-19 02:37:44 +00008454 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008455 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 dict = args;
8457
8458 while (--fmtcnt >= 0) {
8459 if (*fmt != '%') {
8460 if (--rescnt < 0) {
8461 rescnt = fmtcnt + 100;
8462 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008463 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008464 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8466 --rescnt;
8467 }
8468 *res++ = *fmt++;
8469 }
8470 else {
8471 /* Got a format specifier */
8472 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008473 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475 Py_UNICODE c = '\0';
8476 Py_UNICODE fill;
Facundo Batistac11cecf2008-02-24 03:17:21 +00008477 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 PyObject *v = NULL;
8479 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008480 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008482 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008483 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484
8485 fmt++;
8486 if (*fmt == '(') {
8487 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008488 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489 PyObject *key;
8490 int pcount = 1;
8491
8492 if (dict == NULL) {
8493 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008494 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 goto onError;
8496 }
8497 ++fmt;
8498 --fmtcnt;
8499 keystart = fmt;
8500 /* Skip over balanced parentheses */
8501 while (pcount > 0 && --fmtcnt >= 0) {
8502 if (*fmt == ')')
8503 --pcount;
8504 else if (*fmt == '(')
8505 ++pcount;
8506 fmt++;
8507 }
8508 keylen = fmt - keystart - 1;
8509 if (fmtcnt < 0 || pcount > 0) {
8510 PyErr_SetString(PyExc_ValueError,
8511 "incomplete format key");
8512 goto onError;
8513 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008514#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008515 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 then looked up since Python uses strings to hold
8517 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008518 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 key = PyUnicode_EncodeUTF8(keystart,
8520 keylen,
8521 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008522#else
8523 key = PyUnicode_FromUnicode(keystart, keylen);
8524#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525 if (key == NULL)
8526 goto onError;
8527 if (args_owned) {
8528 Py_DECREF(args);
8529 args_owned = 0;
8530 }
8531 args = PyObject_GetItem(dict, key);
8532 Py_DECREF(key);
8533 if (args == NULL) {
8534 goto onError;
8535 }
8536 args_owned = 1;
8537 arglen = -1;
8538 argidx = -2;
8539 }
8540 while (--fmtcnt >= 0) {
8541 switch (c = *fmt++) {
8542 case '-': flags |= F_LJUST; continue;
8543 case '+': flags |= F_SIGN; continue;
8544 case ' ': flags |= F_BLANK; continue;
8545 case '#': flags |= F_ALT; continue;
8546 case '0': flags |= F_ZERO; continue;
8547 }
8548 break;
8549 }
8550 if (c == '*') {
8551 v = getnextarg(args, arglen, &argidx);
8552 if (v == NULL)
8553 goto onError;
8554 if (!PyInt_Check(v)) {
8555 PyErr_SetString(PyExc_TypeError,
8556 "* wants int");
8557 goto onError;
8558 }
8559 width = PyInt_AsLong(v);
8560 if (width < 0) {
8561 flags |= F_LJUST;
8562 width = -width;
8563 }
8564 if (--fmtcnt >= 0)
8565 c = *fmt++;
8566 }
8567 else if (c >= '0' && c <= '9') {
8568 width = c - '0';
8569 while (--fmtcnt >= 0) {
8570 c = *fmt++;
8571 if (c < '0' || c > '9')
8572 break;
8573 if ((width*10) / 10 != width) {
8574 PyErr_SetString(PyExc_ValueError,
8575 "width too big");
8576 goto onError;
8577 }
8578 width = width*10 + (c - '0');
8579 }
8580 }
8581 if (c == '.') {
8582 prec = 0;
8583 if (--fmtcnt >= 0)
8584 c = *fmt++;
8585 if (c == '*') {
8586 v = getnextarg(args, arglen, &argidx);
8587 if (v == NULL)
8588 goto onError;
8589 if (!PyInt_Check(v)) {
8590 PyErr_SetString(PyExc_TypeError,
8591 "* wants int");
8592 goto onError;
8593 }
8594 prec = PyInt_AsLong(v);
8595 if (prec < 0)
8596 prec = 0;
8597 if (--fmtcnt >= 0)
8598 c = *fmt++;
8599 }
8600 else if (c >= '0' && c <= '9') {
8601 prec = c - '0';
8602 while (--fmtcnt >= 0) {
8603 c = Py_CHARMASK(*fmt++);
8604 if (c < '0' || c > '9')
8605 break;
8606 if ((prec*10) / 10 != prec) {
8607 PyErr_SetString(PyExc_ValueError,
8608 "prec too big");
8609 goto onError;
8610 }
8611 prec = prec*10 + (c - '0');
8612 }
8613 }
8614 } /* prec */
8615 if (fmtcnt >= 0) {
8616 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 if (--fmtcnt >= 0)
8618 c = *fmt++;
8619 }
8620 }
8621 if (fmtcnt < 0) {
8622 PyErr_SetString(PyExc_ValueError,
8623 "incomplete format");
8624 goto onError;
8625 }
8626 if (c != '%') {
8627 v = getnextarg(args, arglen, &argidx);
8628 if (v == NULL)
8629 goto onError;
8630 }
8631 sign = 0;
8632 fill = ' ';
8633 switch (c) {
8634
8635 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008636 pbuf = formatbuf;
8637 /* presume that buffer length is at least 1 */
8638 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 len = 1;
8640 break;
8641
8642 case 's':
8643 case 'r':
8644 if (PyUnicode_Check(v) && c == 's') {
8645 temp = v;
8646 Py_INCREF(temp);
8647 }
8648 else {
8649 PyObject *unicode;
8650 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008651 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 else
8653 temp = PyObject_Repr(v);
8654 if (temp == NULL)
8655 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008656 if (PyUnicode_Check(temp))
8657 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008658 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008659 /* convert to string to Unicode */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008660 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8661 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008662 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008664 Py_DECREF(temp);
8665 temp = unicode;
8666 if (temp == NULL)
8667 goto onError;
8668 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008669 else {
8670 Py_DECREF(temp);
8671 PyErr_SetString(PyExc_TypeError,
8672 "%s argument has non-string str()");
8673 goto onError;
8674 }
8675 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008676 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 len = PyUnicode_GET_SIZE(temp);
8678 if (prec >= 0 && len > prec)
8679 len = prec;
8680 break;
8681
8682 case 'i':
8683 case 'd':
8684 case 'u':
8685 case 'o':
8686 case 'x':
8687 case 'X':
8688 if (c == 'i')
8689 c = 'd';
Facundo Batistac11cecf2008-02-24 03:17:21 +00008690 isnumok = 0;
8691 if (PyNumber_Check(v)) {
8692 PyObject *iobj=NULL;
8693
8694 if (PyInt_Check(v) || (PyLong_Check(v))) {
8695 iobj = v;
8696 Py_INCREF(iobj);
8697 }
8698 else {
8699 iobj = PyNumber_Int(v);
8700 if (iobj==NULL) iobj = PyNumber_Long(v);
8701 }
8702 if (iobj!=NULL) {
8703 if (PyInt_Check(iobj)) {
8704 isnumok = 1;
8705 pbuf = formatbuf;
8706 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8707 flags, prec, c, iobj);
8708 Py_DECREF(iobj);
8709 if (len < 0)
8710 goto onError;
8711 sign = 1;
8712 }
8713 else if (PyLong_Check(iobj)) {
8714 isnumok = 1;
8715 temp = formatlong(iobj, flags, prec, c);
8716 Py_DECREF(iobj);
8717 if (!temp)
8718 goto onError;
8719 pbuf = PyUnicode_AS_UNICODE(temp);
8720 len = PyUnicode_GET_SIZE(temp);
8721 sign = 1;
8722 }
8723 else {
8724 Py_DECREF(iobj);
8725 }
8726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 }
Facundo Batistac11cecf2008-02-24 03:17:21 +00008728 if (!isnumok) {
8729 PyErr_Format(PyExc_TypeError,
8730 "%%%c format: a number is required, "
Martin v. Löwisd918e4e2008-04-07 03:08:28 +00008731 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008732 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008733 }
8734 if (flags & F_ZERO)
8735 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736 break;
8737
8738 case 'e':
8739 case 'E':
8740 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008741 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742 case 'g':
8743 case 'G':
Eric Smithd6c393a2008-07-17 19:49:47 +00008744 if (c == 'F')
8745 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008746 pbuf = formatbuf;
8747 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8748 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 if (len < 0)
8750 goto onError;
8751 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008752 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 fill = '0';
8754 break;
8755
8756 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008757 pbuf = formatbuf;
8758 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 if (len < 0)
8760 goto onError;
8761 break;
8762
8763 default:
8764 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008765 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008766 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008767 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008768 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008769 (Py_ssize_t)(fmt - 1 -
8770 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 goto onError;
8772 }
8773 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008774 if (*pbuf == '-' || *pbuf == '+') {
8775 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 len--;
8777 }
8778 else if (flags & F_SIGN)
8779 sign = '+';
8780 else if (flags & F_BLANK)
8781 sign = ' ';
8782 else
8783 sign = 0;
8784 }
8785 if (width < len)
8786 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008787 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 reslen -= rescnt;
8789 rescnt = width + fmtcnt + 100;
8790 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008791 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008792 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008793 PyErr_NoMemory();
8794 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008795 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008796 if (_PyUnicode_Resize(&result, reslen) < 0) {
8797 Py_XDECREF(temp);
8798 goto onError;
8799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 res = PyUnicode_AS_UNICODE(result)
8801 + reslen - rescnt;
8802 }
8803 if (sign) {
8804 if (fill != ' ')
8805 *res++ = sign;
8806 rescnt--;
8807 if (width > len)
8808 width--;
8809 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008810 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8811 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008812 assert(pbuf[1] == c);
8813 if (fill != ' ') {
8814 *res++ = *pbuf++;
8815 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008816 }
Tim Petersfff53252001-04-12 18:38:48 +00008817 rescnt -= 2;
8818 width -= 2;
8819 if (width < 0)
8820 width = 0;
8821 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823 if (width > len && !(flags & F_LJUST)) {
8824 do {
8825 --rescnt;
8826 *res++ = fill;
8827 } while (--width > len);
8828 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008829 if (fill == ' ') {
8830 if (sign)
8831 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008832 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008833 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008834 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008835 *res++ = *pbuf++;
8836 *res++ = *pbuf++;
8837 }
8838 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008839 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840 res += len;
8841 rescnt -= len;
8842 while (--width >= len) {
8843 --rescnt;
8844 *res++ = ' ';
8845 }
8846 if (dict && (argidx < arglen) && c != '%') {
8847 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008848 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008849 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850 goto onError;
8851 }
8852 Py_XDECREF(temp);
8853 } /* '%' */
8854 } /* until end */
8855 if (argidx < arglen && !dict) {
8856 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008857 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 goto onError;
8859 }
8860
Thomas Woutersa96affe2006-03-12 00:29:36 +00008861 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8862 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 if (args_owned) {
8864 Py_DECREF(args);
8865 }
8866 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867 return (PyObject *)result;
8868
8869 onError:
8870 Py_XDECREF(result);
8871 Py_DECREF(uformat);
8872 if (args_owned) {
8873 Py_DECREF(args);
8874 }
8875 return NULL;
8876}
8877
8878static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008879 (readbufferproc) unicode_buffer_getreadbuf,
8880 (writebufferproc) unicode_buffer_getwritebuf,
8881 (segcountproc) unicode_buffer_getsegcount,
8882 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883};
8884
Jeremy Hylton938ace62002-07-17 16:30:39 +00008885static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008886unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8887
Tim Peters6d6c1a32001-08-02 04:15:00 +00008888static PyObject *
8889unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8890{
8891 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008892 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008893 char *encoding = NULL;
8894 char *errors = NULL;
8895
Guido van Rossume023fe02001-08-30 03:12:59 +00008896 if (type != &PyUnicode_Type)
8897 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008898 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8899 kwlist, &x, &encoding, &errors))
8900 return NULL;
8901 if (x == NULL)
8902 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008903 if (encoding == NULL && errors == NULL)
8904 return PyObject_Unicode(x);
8905 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008906 return PyUnicode_FromEncodedObject(x, encoding, errors);
8907}
8908
Guido van Rossume023fe02001-08-30 03:12:59 +00008909static PyObject *
8910unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8911{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008912 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008913 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008914
8915 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8916 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8917 if (tmp == NULL)
8918 return NULL;
8919 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008920 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008921 if (pnew == NULL) {
8922 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008923 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008924 }
Neal Norwitz419fd492008-03-17 20:22:43 +00008925 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008926 if (pnew->str == NULL) {
8927 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008928 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008929 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008930 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008931 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008932 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8933 pnew->length = n;
8934 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008935 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008936 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008937}
8938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008939PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008940"unicode(string [, encoding[, errors]]) -> object\n\
8941\n\
8942Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008943encoding defaults to the current default string encoding.\n\
8944errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008945
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008947 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948 "unicode", /* tp_name */
8949 sizeof(PyUnicodeObject), /* tp_size */
8950 0, /* tp_itemsize */
8951 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008952 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008954 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008956 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008957 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008958 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008960 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 (hashfunc) unicode_hash, /* tp_hash*/
8962 0, /* tp_call*/
8963 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008964 PyObject_GenericGetAttr, /* tp_getattro */
8965 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008967 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008968 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008969 unicode_doc, /* tp_doc */
8970 0, /* tp_traverse */
8971 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008972 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008973 0, /* tp_weaklistoffset */
8974 0, /* tp_iter */
8975 0, /* tp_iternext */
8976 unicode_methods, /* tp_methods */
8977 0, /* tp_members */
8978 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008979 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008980 0, /* tp_dict */
8981 0, /* tp_descr_get */
8982 0, /* tp_descr_set */
8983 0, /* tp_dictoffset */
8984 0, /* tp_init */
8985 0, /* tp_alloc */
8986 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008987 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988};
8989
8990/* Initialize the Unicode implementation */
8991
Thomas Wouters78890102000-07-22 19:25:51 +00008992void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008994 int i;
8995
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008996 /* XXX - move this array to unicodectype.c ? */
8997 Py_UNICODE linebreak[] = {
8998 0x000A, /* LINE FEED */
8999 0x000D, /* CARRIAGE RETURN */
9000 0x001C, /* FILE SEPARATOR */
9001 0x001D, /* GROUP SEPARATOR */
9002 0x001E, /* RECORD SEPARATOR */
9003 0x0085, /* NEXT LINE */
9004 0x2028, /* LINE SEPARATOR */
9005 0x2029, /* PARAGRAPH SEPARATOR */
9006 };
9007
Fred Drakee4315f52000-05-09 19:53:39 +00009008 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009009 free_list = NULL;
9010 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009012 if (!unicode_empty)
9013 return;
9014
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009015 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009016 for (i = 0; i < 256; i++)
9017 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009018 if (PyType_Ready(&PyUnicode_Type) < 0)
9019 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009020
9021 /* initialize the linebreak bloom filter */
9022 bloom_linebreak = make_bloom_mask(
9023 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9024 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009025
9026 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027}
9028
9029/* Finalize the Unicode implementation */
9030
Christian Heimes3b718a72008-02-14 12:47:33 +00009031int
9032PyUnicode_ClearFreeList(void)
9033{
9034 int freelist_size = numfree;
9035 PyUnicodeObject *u;
9036
9037 for (u = free_list; u != NULL;) {
9038 PyUnicodeObject *v = u;
9039 u = *(PyUnicodeObject **)u;
9040 if (v->str)
Neal Norwitz419fd492008-03-17 20:22:43 +00009041 PyObject_DEL(v->str);
Christian Heimes3b718a72008-02-14 12:47:33 +00009042 Py_XDECREF(v->defenc);
9043 PyObject_Del(v);
9044 numfree--;
9045 }
9046 free_list = NULL;
9047 assert(numfree == 0);
9048 return freelist_size;
9049}
9050
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051void
Thomas Wouters78890102000-07-22 19:25:51 +00009052_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009054 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009056 Py_XDECREF(unicode_empty);
9057 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009058
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009059 for (i = 0; i < 256; i++) {
9060 if (unicode_latin1[i]) {
9061 Py_DECREF(unicode_latin1[i]);
9062 unicode_latin1[i] = NULL;
9063 }
9064 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009065 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009067
Anthony Baxterac6bd462006-04-13 02:06:09 +00009068#ifdef __cplusplus
9069}
9070#endif
9071
9072
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009073/*
9074Local variables:
9075c-basic-offset: 4
9076indent-tabs-mode: nil
9077End:
9078*/