blob: 2f80e59b32fd84dbcaa741f108784d417470f0fd [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson339f8c62009-01-31 22:25:08 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list */

#define PyUnicode_MAXFREELIST       1024

/* Size limit for the free list's keep-alive optimization.

   Objects parked on the free list keep their character buffer
   allocated as long as their length is below this limit.  This
   reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters: entry i is
   1 when ASCII code point i counts as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
/* 0x00 - 0x0F
 *     0x0009: HORIZONTAL TABULATION
 *     0x000A: LINE FEED
 *     0x000B: VERTICAL TABULATION
 *     0x000C: FORM FEED
 *     0x000D: CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
/* 0x10 - 0x1F
 *     0x001C: FILE SEPARATOR
 *     0x001D: GROUP SEPARATOR
 *     0x001E: RECORD SEPARATOR
 *     0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
/* 0x20 - 0x2F
 *     0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30 - 0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same for linebreaks: entry i is 1 when ASCII code point i is a line
   break character, 0 otherwise. */
static unsigned char ascii_linebreak[] = {
/* 0x00 - 0x0F
 *     0x000A: LINE FEED
 *     0x000D: CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
/* 0x10 - 0x1F
 *     0x001C: FILE SEPARATOR
 *     0x001D: GROUP SEPARATOR
 *     0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
/* 0x20 - 0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
/* --- Bloom Filters ----------------------------------------------------- */

/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits from each Unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Test whether the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL rather than 1: shifting a signed int
   left by 31 overflows it, which is undefined behavior. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: ASCII characters use the ascii_linebreak table, all
   others go through the bloom mask first and Py_UNICODE_ISLINEBREAK()
   only when the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* Membership test: consult the bloom mask first and only run the exact
   unicode_member() scan when the mask matches.  The expansion is fully
   parenthesized so it can be negated or combined with other operators
   at the call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000247 if (unicode == unicode_empty ||
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
Georg Brandl6290bcf2010-08-01 21:48:47 +0000275 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 }
277 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000278
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return 0;
280}
281
282/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000283 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
285 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000286 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
288*/
289
290static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000291PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292{
293 register PyUnicodeObject *unicode;
294
Andrew Dalkee0df7622006-05-27 11:04:36 +0000295 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 if (length == 0 && unicode_empty != NULL) {
297 Py_INCREF(unicode_empty);
298 return unicode_empty;
299 }
300
Neal Norwitze7d8be82008-07-31 17:17:14 +0000301 /* Ensure we won't overflow the size. */
302 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
303 return (PyUnicodeObject *)PyErr_NoMemory();
304 }
305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000307 if (free_list) {
308 unicode = free_list;
309 free_list = *(PyUnicodeObject **)unicode;
310 numfree--;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000311 if (unicode->str) {
312 /* Keep-Alive optimization: we only upsize the buffer,
313 never downsize it. */
314 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000315 unicode_resize(unicode, length) < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000316 PyObject_DEL(unicode->str);
317 unicode->str = NULL;
318 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000319 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000320 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000321 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
322 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000323 }
324 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000325 }
326 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000327 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000328 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 if (unicode == NULL)
330 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000331 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
332 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 }
334
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000335 if (!unicode->str) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000336 PyErr_NoMemory();
337 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000338 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000339 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000340 * the caller fails before initializing str -- unicode_resize()
341 * reads str[0], and the Keep-Alive optimization can keep memory
342 * allocated for str alive across a call to unicode_dealloc(unicode).
343 * We don't want unicode_resize to read uninitialized memory in
344 * that case.
345 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000346 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000348 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000350 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000352
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000353 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000354 /* XXX UNREF/NEWREF interface should be more symmetrical */
355 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000356 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000357 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359}
360
361static
Guido van Rossum9475a232001-10-05 20:51:39 +0000362void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000364 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000365 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000366 /* Keep-Alive optimization */
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000367 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
368 PyObject_DEL(unicode->str);
369 unicode->str = NULL;
370 unicode->length = 0;
371 }
372 if (unicode->defenc) {
Georg Brandl6290bcf2010-08-01 21:48:47 +0000373 Py_CLEAR(unicode->defenc);
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000374 }
375 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000376 *(PyUnicodeObject **)unicode = free_list;
377 free_list = unicode;
378 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 }
380 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000381 PyObject_DEL(unicode->str);
382 Py_XDECREF(unicode->defenc);
383 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 }
385}
386
Benjamin Peterson828a7062008-12-27 17:05:29 +0000387static
388int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389{
390 register PyUnicodeObject *v;
391
392 /* Argument checks */
393 if (unicode == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000394 PyErr_BadInternalCall();
395 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000396 }
Benjamin Peterson828a7062008-12-27 17:05:29 +0000397 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000398 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000399 PyErr_BadInternalCall();
400 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000401 }
402
403 /* Resizing unicode_empty and single character objects is not
404 possible since these are being shared. We simply return a fresh
405 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000406 if (v->length != length &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000407 (v == unicode_empty || v->length == 1)) {
408 PyUnicodeObject *w = _PyUnicode_New(length);
409 if (w == NULL)
410 return -1;
411 Py_UNICODE_COPY(w->str, v->str,
412 length < v->length ? length : v->length);
413 Py_DECREF(*unicode);
414 *unicode = w;
415 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 }
417
418 /* Note that we don't have to modify *unicode for unshared Unicode
419 objects, since we can modify them in-place. */
420 return unicode_resize(v, length);
421}
422
Benjamin Peterson828a7062008-12-27 17:05:29 +0000423int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
424{
425 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
426}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000429 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430{
431 PyUnicodeObject *unicode;
432
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433 /* If the Unicode data is known at construction time, we can apply
434 some optimizations which share commonly used objects. */
435 if (u != NULL) {
436
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000437 /* Optimization for empty strings */
438 if (size == 0 && unicode_empty != NULL) {
439 Py_INCREF(unicode_empty);
440 return (PyObject *)unicode_empty;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000441 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000442
443 /* Single character Unicode objects in the Latin-1 range are
444 shared when using this constructor */
445 if (size == 1 && *u < 256) {
446 unicode = unicode_latin1[*u];
447 if (!unicode) {
448 unicode = _PyUnicode_New(1);
449 if (!unicode)
450 return NULL;
451 unicode->str[0] = *u;
452 unicode_latin1[*u] = unicode;
453 }
454 Py_INCREF(unicode);
455 return (PyObject *)unicode;
456 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000457 }
Tim Petersced69f82003-09-16 20:30:58 +0000458
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459 unicode = _PyUnicode_New(size);
460 if (!unicode)
461 return NULL;
462
463 /* Copy the Unicode data into the new object */
464 if (u != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000465 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000466
467 return (PyObject *)unicode;
468}
469
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000470PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
471{
472 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000473
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000474 if (size < 0) {
475 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000476 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000477 return NULL;
478 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000479
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects.
482 Also, this means the input must be UTF-8, so fall back to the
483 UTF-8 decoder at the end. */
484 if (u != NULL) {
485
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000490 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000491
492 /* Single characters are shared when using this constructor.
493 Restrict to ASCII, since the input must be UTF-8. */
494 if (size == 1 && Py_CHARMASK(*u) < 128) {
495 unicode = unicode_latin1[Py_CHARMASK(*u)];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = Py_CHARMASK(*u);
501 unicode_latin1[Py_CHARMASK(*u)] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000506
507 return PyUnicode_DecodeUTF8(u, size, NULL);
508 }
509
510 unicode = _PyUnicode_New(size);
511 if (!unicode)
512 return NULL;
513
514 return (PyObject *)unicode;
515}
516
517PyObject *PyUnicode_FromString(const char *u)
518{
519 size_t size = strlen(u);
520 if (size > PY_SSIZE_T_MAX) {
521 PyErr_SetString(PyExc_OverflowError, "input too long");
522 return NULL;
523 }
524
525 return PyUnicode_FromStringAndSize(u, size);
526}
527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528#ifdef HAVE_WCHAR_H
529
530PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000531 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532{
533 PyUnicodeObject *unicode;
534
535 if (w == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000536 PyErr_BadInternalCall();
537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000538 }
539
540 unicode = _PyUnicode_New(size);
541 if (!unicode)
542 return NULL;
543
544 /* Copy the wchar_t data into the new object */
545#ifdef HAVE_USABLE_WCHAR_T
546 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000547#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000549 register Py_UNICODE *u;
550 register Py_ssize_t i;
551 u = PyUnicode_AS_UNICODE(unicode);
552 for (i = size; i > 0; i--)
553 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 }
555#endif
556
557 return (PyObject *)unicode;
558}
559
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000560static void
561makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
562{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000563 *fmt++ = '%';
564 if (width) {
565 if (zeropad)
566 *fmt++ = '0';
567 fmt += sprintf(fmt, "%d", width);
568 }
569 if (precision)
570 fmt += sprintf(fmt, ".%d", precision);
571 if (longflag)
572 *fmt++ = 'l';
573 else if (size_tflag) {
574 char *f = PY_FORMAT_SIZE_T;
575 while (*f)
576 *fmt++ = *f++;
577 }
578 *fmt++ = c;
579 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000580}
581
/* Copy the NUL-terminated string `string` to the output cursor `s`,
   advancing `s` past the copied characters (the terminator is not
   copied).  Relies on variables named `copy` and `s` being in scope
   where the macro is expanded. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
583
584PyObject *
585PyUnicode_FromFormatV(const char *format, va_list vargs)
586{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000587 va_list count;
588 Py_ssize_t callcount = 0;
589 PyObject **callresults = NULL;
590 PyObject **callresult = NULL;
591 Py_ssize_t n = 0;
592 int width = 0;
593 int precision = 0;
594 int zeropad;
595 const char* f;
596 Py_UNICODE *s;
597 PyObject *string;
598 /* used by sprintf */
599 char buffer[21];
600 /* use abuffer instead of buffer, if we need more space
601 * (which can happen if there's a format specifier with width). */
602 char *abuffer = NULL;
603 char *realbuffer;
604 Py_ssize_t abuffersize = 0;
605 char fmt[60]; /* should be enough for %0width.precisionld */
606 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000607
608#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000609 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000610#else
611#ifdef __va_copy
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000612 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000613#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000614 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000615#endif
616#endif
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000617 /* step 1: count the number of %S/%R/%s format specifications
618 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
619 * objects once during step 3 and put the result in an array) */
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000620 for (f = format; *f; f++) {
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000621 if (*f == '%') {
622 if (*(f+1)=='%')
623 continue;
Walter Dörwald67032252009-05-03 22:46:50 +0000624 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000625 ++callcount;
626 while (isdigit((unsigned)*f))
627 width = (width*10) + *f++ - '0';
628 while (*++f && *f != '%' && !isalpha((unsigned)*f))
629 ;
630 if (*f == 's')
631 ++callcount;
632 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000633 }
634 /* step 2: allocate memory for the results of
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000635 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000636 if (callcount) {
637 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
638 if (!callresults) {
639 PyErr_NoMemory();
640 return NULL;
641 }
642 callresult = callresults;
643 }
644 /* step 3: figure out how large a buffer we need */
645 for (f = format; *f; f++) {
646 if (*f == '%') {
647 const char* p = f;
648 width = 0;
649 while (isdigit((unsigned)*f))
650 width = (width*10) + *f++ - '0';
651 while (*++f && *f != '%' && !isalpha((unsigned)*f))
652 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000653
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000654 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
655 * they don't affect the amount of space we reserve.
656 */
657 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000658 (f[1] == 'd' || f[1] == 'u'))
659 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000660
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000661 switch (*f) {
662 case 'c':
663 (void)va_arg(count, int);
664 /* fall through... */
665 case '%':
666 n++;
667 break;
668 case 'd': case 'u': case 'i': case 'x':
669 (void) va_arg(count, int);
670 /* 20 bytes is enough to hold a 64-bit
671 integer. Decimal takes the most space.
672 This isn't enough for octal.
673 If a width is specified we need more
674 (which we allocate later). */
675 if (width < 20)
676 width = 20;
677 n += width;
678 if (abuffersize < width)
679 abuffersize = width;
680 break;
681 case 's':
682 {
683 /* UTF-8 */
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000684 unsigned char *s = va_arg(count, unsigned char*);
685 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
686 if (!str)
687 goto fail;
688 n += PyUnicode_GET_SIZE(str);
689 /* Remember the str and switch to the next slot */
690 *callresult++ = str;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000691 break;
692 }
693 case 'U':
694 {
695 PyObject *obj = va_arg(count, PyObject *);
696 assert(obj && PyUnicode_Check(obj));
697 n += PyUnicode_GET_SIZE(obj);
698 break;
699 }
700 case 'V':
701 {
702 PyObject *obj = va_arg(count, PyObject *);
703 const char *str = va_arg(count, const char *);
704 assert(obj || str);
705 assert(!obj || PyUnicode_Check(obj));
706 if (obj)
707 n += PyUnicode_GET_SIZE(obj);
708 else
709 n += strlen(str);
710 break;
711 }
712 case 'S':
713 {
714 PyObject *obj = va_arg(count, PyObject *);
715 PyObject *str;
716 assert(obj);
717 str = PyObject_Str(obj);
718 if (!str)
719 goto fail;
720 n += PyUnicode_GET_SIZE(str);
721 /* Remember the str and switch to the next slot */
722 *callresult++ = str;
723 break;
724 }
725 case 'R':
726 {
727 PyObject *obj = va_arg(count, PyObject *);
728 PyObject *repr;
729 assert(obj);
730 repr = PyObject_Repr(obj);
731 if (!repr)
732 goto fail;
733 n += PyUnicode_GET_SIZE(repr);
734 /* Remember the repr and switch to the next slot */
735 *callresult++ = repr;
736 break;
737 }
738 case 'p':
739 (void) va_arg(count, int);
740 /* maximum 64-bit pointer representation:
741 * 0xffffffffffffffff
742 * so 19 characters is enough.
743 * XXX I count 18 -- what's the extra for?
744 */
745 n += 19;
746 break;
747 default:
748 /* if we stumble upon an unknown
749 formatting code, copy the rest of
750 the format string to the output
751 string. (we cannot just skip the
752 code, since there's no way to know
753 what's in the argument list) */
754 n += strlen(p);
755 goto expand;
756 }
757 } else
758 n++;
759 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000760 expand:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000761 if (abuffersize > 20) {
762 abuffer = PyObject_Malloc(abuffersize);
763 if (!abuffer) {
764 PyErr_NoMemory();
765 goto fail;
766 }
767 realbuffer = abuffer;
768 }
769 else
770 realbuffer = buffer;
771 /* step 4: fill the buffer */
772 /* Since we've analyzed how much space we need for the worst case,
773 we don't have to resize the string.
774 There can be no errors beyond this point. */
775 string = PyUnicode_FromUnicode(NULL, n);
776 if (!string)
777 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000778
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000779 s = PyUnicode_AS_UNICODE(string);
780 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000781
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000782 for (f = format; *f; f++) {
783 if (*f == '%') {
784 const char* p = f++;
785 int longflag = 0;
786 int size_tflag = 0;
787 zeropad = (*f == '0');
788 /* parse the width.precision part */
789 width = 0;
790 while (isdigit((unsigned)*f))
791 width = (width*10) + *f++ - '0';
792 precision = 0;
793 if (*f == '.') {
794 f++;
795 while (isdigit((unsigned)*f))
796 precision = (precision*10) + *f++ - '0';
797 }
798 /* handle the long flag, but only for %ld and %lu.
799 others can be added when necessary. */
800 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
801 longflag = 1;
802 ++f;
803 }
804 /* handle the size_t flag. */
805 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
806 size_tflag = 1;
807 ++f;
808 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000809
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000810 switch (*f) {
811 case 'c':
812 *s++ = va_arg(vargs, int);
813 break;
814 case 'd':
815 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
816 if (longflag)
817 sprintf(realbuffer, fmt, va_arg(vargs, long));
818 else if (size_tflag)
819 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
820 else
821 sprintf(realbuffer, fmt, va_arg(vargs, int));
822 appendstring(realbuffer);
823 break;
824 case 'u':
825 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
826 if (longflag)
827 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
828 else if (size_tflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
830 else
831 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
832 appendstring(realbuffer);
833 break;
834 case 'i':
835 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
836 sprintf(realbuffer, fmt, va_arg(vargs, int));
837 appendstring(realbuffer);
838 break;
839 case 'x':
840 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
841 sprintf(realbuffer, fmt, va_arg(vargs, int));
842 appendstring(realbuffer);
843 break;
844 case 's':
845 {
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000846 /* unused, since we already have the result */
847 (void) va_arg(vargs, char *);
848 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
849 PyUnicode_GET_SIZE(*callresult));
850 s += PyUnicode_GET_SIZE(*callresult);
851 /* We're done with the unicode()/repr() => forget it */
852 Py_DECREF(*callresult);
853 /* switch to next unicode()/repr() result */
854 ++callresult;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000855 break;
856 }
857 case 'U':
858 {
859 PyObject *obj = va_arg(vargs, PyObject *);
860 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
861 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
862 s += size;
863 break;
864 }
865 case 'V':
866 {
867 PyObject *obj = va_arg(vargs, PyObject *);
868 const char *str = va_arg(vargs, const char *);
869 if (obj) {
870 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
871 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
872 s += size;
873 } else {
874 appendstring(str);
875 }
876 break;
877 }
878 case 'S':
879 case 'R':
880 {
881 Py_UNICODE *ucopy;
882 Py_ssize_t usize;
883 Py_ssize_t upos;
884 /* unused, since we already have the result */
885 (void) va_arg(vargs, PyObject *);
886 ucopy = PyUnicode_AS_UNICODE(*callresult);
887 usize = PyUnicode_GET_SIZE(*callresult);
888 for (upos = 0; upos<usize;)
889 *s++ = ucopy[upos++];
890 /* We're done with the unicode()/repr() => forget it */
891 Py_DECREF(*callresult);
892 /* switch to next unicode()/repr() result */
893 ++callresult;
894 break;
895 }
896 case 'p':
897 sprintf(buffer, "%p", va_arg(vargs, void*));
898 /* %p is ill-defined: ensure leading 0x. */
899 if (buffer[1] == 'X')
900 buffer[1] = 'x';
901 else if (buffer[1] != 'x') {
902 memmove(buffer+2, buffer, strlen(buffer)+1);
903 buffer[0] = '0';
904 buffer[1] = 'x';
905 }
906 appendstring(buffer);
907 break;
908 case '%':
909 *s++ = '%';
910 break;
911 default:
912 appendstring(p);
913 goto end;
914 }
915 } else
916 *s++ = *f;
917 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000918
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000919 end:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000920 if (callresults)
921 PyObject_Free(callresults);
922 if (abuffer)
923 PyObject_Free(abuffer);
924 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
925 return string;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000926 fail:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000927 if (callresults) {
928 PyObject **callresult2 = callresults;
929 while (callresult2 < callresult) {
930 Py_DECREF(*callresult2);
931 ++callresult2;
932 }
933 PyObject_Free(callresults);
934 }
935 if (abuffer)
936 PyObject_Free(abuffer);
937 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000938}
939
940#undef appendstring
941
942PyObject *
943PyUnicode_FromFormat(const char *format, ...)
944{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000945 PyObject* ret;
946 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000947
948#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000949 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000950#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000951 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000952#endif
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000953 ret = PyUnicode_FromFormatV(format, vargs);
954 va_end(vargs);
955 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000956}
957
Martin v. Löwis18e16552006-02-15 17:27:45 +0000958Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000959 wchar_t *w,
960 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000961{
962 if (unicode == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000963 PyErr_BadInternalCall();
964 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000966
967 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000968 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000969 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000970
Guido van Rossumd57fd912000-03-10 22:53:23 +0000971#ifdef HAVE_USABLE_WCHAR_T
972 memcpy(w, unicode->str, size * sizeof(wchar_t));
973#else
974 {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000975 register Py_UNICODE *u;
976 register Py_ssize_t i;
977 u = PyUnicode_AS_UNICODE(unicode);
978 for (i = size; i > 0; i--)
979 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000980 }
981#endif
982
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000983 if (size > PyUnicode_GET_SIZE(unicode))
984 return PyUnicode_GET_SIZE(unicode);
985 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000986 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987}
988
989#endif
990
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000991PyObject *PyUnicode_FromOrdinal(int ordinal)
992{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000993 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000994
995#ifdef Py_UNICODE_WIDE
996 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000997 PyErr_SetString(PyExc_ValueError,
998 "unichr() arg not in range(0x110000) "
999 "(wide Python build)");
1000 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001001 }
1002#else
1003 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001004 PyErr_SetString(PyExc_ValueError,
1005 "unichr() arg not in range(0x10000) "
1006 "(narrow Python build)");
1007 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001008 }
1009#endif
1010
Hye-Shik Chang40574832004-04-06 07:24:51 +00001011 s[0] = (Py_UNICODE)ordinal;
1012 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001013}
1014
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015PyObject *PyUnicode_FromObject(register PyObject *obj)
1016{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001017 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001018 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001019 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001020 Py_INCREF(obj);
1021 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001022 }
1023 if (PyUnicode_Check(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001024 /* For a Unicode subtype that's not a Unicode object,
1025 return a true Unicode object with the same data. */
1026 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1027 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001028 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001029 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1030}
1031
1032PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001033 const char *encoding,
1034 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001035{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001036 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001037 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001038 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001039
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 if (obj == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001041 PyErr_BadInternalCall();
1042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001044
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001045#if 0
1046 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001047 that no encodings is given and then redirect to
1048 PyObject_Unicode() which then applies the additional logic for
1049 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001050
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001051 NOTE: This API should really only be used for object which
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001052 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001053
1054 */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001055 if (PyUnicode_Check(obj)) {
1056 if (encoding) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001057 PyErr_SetString(PyExc_TypeError,
1058 "decoding Unicode is not supported");
1059 return NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001060 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001061 return PyObject_Unicode(obj);
1062 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063#else
1064 if (PyUnicode_Check(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001065 PyErr_SetString(PyExc_TypeError,
1066 "decoding Unicode is not supported");
1067 return NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001068 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001069#endif
1070
1071 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001072 if (PyString_Check(obj)) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001073 s = PyString_AS_STRING(obj);
1074 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001075 }
Christian Heimes3497f942008-05-26 12:29:14 +00001076 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001077 /* Python 2.x specific */
1078 PyErr_Format(PyExc_TypeError,
1079 "decoding bytearray is not supported");
1080 return NULL;
1081 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001082 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001083 /* Overwrite the error message with something more useful in
1084 case of a TypeError. */
1085 if (PyErr_ExceptionMatches(PyExc_TypeError))
1086 PyErr_Format(PyExc_TypeError,
1087 "coercing to Unicode: need string or buffer, "
1088 "%.80s found",
1089 Py_TYPE(obj)->tp_name);
1090 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001091 }
Tim Petersced69f82003-09-16 20:30:58 +00001092
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001093 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094 if (len == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001095 Py_INCREF(unicode_empty);
1096 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 }
Tim Petersced69f82003-09-16 20:30:58 +00001098 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001099 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001100
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001101 return v;
1102
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001103 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001104 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105}
1106
1107PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001108 Py_ssize_t size,
1109 const char *encoding,
1110 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111{
1112 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001113
1114 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001115 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001116
1117 /* Shortcuts for common default encodings */
1118 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001120 else if (strcmp(encoding, "latin-1") == 0)
1121 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001122#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1123 else if (strcmp(encoding, "mbcs") == 0)
1124 return PyUnicode_DecodeMBCS(s, size, errors);
1125#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001126 else if (strcmp(encoding, "ascii") == 0)
1127 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128
1129 /* Decode via the codec registry */
1130 buffer = PyBuffer_FromMemory((void *)s, size);
1131 if (buffer == NULL)
1132 goto onError;
1133 unicode = PyCodec_Decode(buffer, encoding, errors);
1134 if (unicode == NULL)
1135 goto onError;
1136 if (!PyUnicode_Check(unicode)) {
1137 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001138 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001139 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 Py_DECREF(unicode);
1141 goto onError;
1142 }
1143 Py_DECREF(buffer);
1144 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001145
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001146 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147 Py_XDECREF(buffer);
1148 return NULL;
1149}
1150
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001151PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1152 const char *encoding,
1153 const char *errors)
1154{
1155 PyObject *v;
1156
1157 if (!PyUnicode_Check(unicode)) {
1158 PyErr_BadArgument();
1159 goto onError;
1160 }
1161
1162 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001163 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001164
1165 /* Decode via the codec registry */
1166 v = PyCodec_Decode(unicode, encoding, errors);
1167 if (v == NULL)
1168 goto onError;
1169 return v;
1170
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001171 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001172 return NULL;
1173}
1174
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001176 Py_ssize_t size,
1177 const char *encoding,
1178 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179{
1180 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001181
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 unicode = PyUnicode_FromUnicode(s, size);
1183 if (unicode == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1186 Py_DECREF(unicode);
1187 return v;
1188}
1189
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001190PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1191 const char *encoding,
1192 const char *errors)
1193{
1194 PyObject *v;
1195
1196 if (!PyUnicode_Check(unicode)) {
1197 PyErr_BadArgument();
1198 goto onError;
1199 }
1200
1201 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001202 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001203
1204 /* Encode via the codec registry */
1205 v = PyCodec_Encode(unicode, encoding, errors);
1206 if (v == NULL)
1207 goto onError;
1208 return v;
1209
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001210 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001211 return NULL;
1212}
1213
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1215 const char *encoding,
1216 const char *errors)
1217{
1218 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001219
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 if (!PyUnicode_Check(unicode)) {
1221 PyErr_BadArgument();
1222 goto onError;
1223 }
Fred Drakee4315f52000-05-09 19:53:39 +00001224
Tim Petersced69f82003-09-16 20:30:58 +00001225 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001226 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001227
1228 /* Shortcuts for common default encodings */
1229 if (errors == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001230 if (strcmp(encoding, "utf-8") == 0)
1231 return PyUnicode_AsUTF8String(unicode);
1232 else if (strcmp(encoding, "latin-1") == 0)
1233 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001234#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001235 else if (strcmp(encoding, "mbcs") == 0)
1236 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001237#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001238 else if (strcmp(encoding, "ascii") == 0)
1239 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241
1242 /* Encode via the codec registry */
1243 v = PyCodec_Encode(unicode, encoding, errors);
1244 if (v == NULL)
1245 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001246 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001248 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001249 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 Py_DECREF(v);
1251 goto onError;
1252 }
1253 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001255 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 return NULL;
1257}
1258
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001259PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001260 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001261{
1262 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1263
1264 if (v)
1265 return v;
1266 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1267 if (v && errors == NULL)
1268 ((PyUnicodeObject *)unicode)->defenc = v;
1269 return v;
1270}
1271
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1273{
1274 if (!PyUnicode_Check(unicode)) {
1275 PyErr_BadArgument();
1276 goto onError;
1277 }
1278 return PyUnicode_AS_UNICODE(unicode);
1279
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001280 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 return NULL;
1282}
1283
Martin v. Löwis18e16552006-02-15 17:27:45 +00001284Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285{
1286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 goto onError;
1289 }
1290 return PyUnicode_GET_SIZE(unicode);
1291
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001292 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 return -1;
1294}
1295
Thomas Wouters78890102000-07-22 19:25:51 +00001296const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001297{
1298 return unicode_default_encoding;
1299}
1300
1301int PyUnicode_SetDefaultEncoding(const char *encoding)
1302{
1303 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001304
Fred Drakee4315f52000-05-09 19:53:39 +00001305 /* Make sure the encoding is valid. As side effect, this also
1306 loads the encoding into the codec registry cache. */
1307 v = _PyCodec_Lookup(encoding);
1308 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001309 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001310 Py_DECREF(v);
1311 strncpy(unicode_default_encoding,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001312 encoding,
1313 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001314 return 0;
1315
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001316 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001317 return -1;
1318}
1319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001320/* error handling callback helper:
1321 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001322 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001323 and adjust various state variables.
1324 return 0 on success, -1 on error
1325*/
1326
1327static
1328int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001329 const char *encoding, const char *reason,
1330 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1331 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1332 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001334 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335
1336 PyObject *restuple = NULL;
1337 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001338 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1339 Py_ssize_t requiredsize;
1340 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001341 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001342 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001343 int res = -1;
1344
1345 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001346 *errorHandler = PyCodec_LookupError(errors);
1347 if (*errorHandler == NULL)
1348 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349 }
1350
1351 if (*exceptionObject == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001352 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001353 encoding, input, insize, *startinpos, *endinpos, reason);
1354 if (*exceptionObject == NULL)
1355 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001356 }
1357 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001358 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1359 goto onError;
1360 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1361 goto onError;
1362 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1363 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001364 }
1365
1366 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1367 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001368 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001369 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00001370 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001371 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001372 }
1373 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001374 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 if (newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001376 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001377 if (newpos<0 || newpos>insize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001378 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1379 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001380 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001381
1382 /* need more space? (at least enough for what we
1383 have+the replacement+the rest of the string (starting
1384 at the new input position), so we won't have to check space
1385 when there are no errors in the rest of the string) */
1386 repptr = PyUnicode_AS_UNICODE(repunicode);
1387 repsize = PyUnicode_GET_SIZE(repunicode);
1388 requiredsize = *outpos + repsize + insize-newpos;
1389 if (requiredsize > outsize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001390 if (requiredsize<2*outsize)
1391 requiredsize = 2*outsize;
1392 if (_PyUnicode_Resize(output, requiredsize) < 0)
1393 goto onError;
1394 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395 }
1396 *endinpos = newpos;
1397 *inptr = input + newpos;
1398 Py_UNICODE_COPY(*outptr, repptr, repsize);
1399 *outptr += repsize;
1400 *outpos += repsize;
1401 /* we made it! */
1402 res = 0;
1403
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001404 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001405 Py_XDECREF(restuple);
1406 return res;
1407}
1408
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001409/* --- UTF-7 Codec -------------------------------------------------------- */
1410
1411/* see RFC2152 for details */
1412
/* Classification of the 128 ASCII code points for UTF-7 encoding,
   i.e. whether a character is special (cannot be directly encoded):
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   One row per 16 code points; the row's range is given on the right. */
static
char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 - 0x2f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 - 0x3f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 - 0x4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 - 0x5f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 - 0x6f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,   /* 0x70 - 0x7f */
};
1431
/* SPECIAL(c, encodeO, encodeWS) is true when `c` has to be base64
   encoded: it lies outside 1..127, or utf7_special marks it as special
   (1), or as whitespace (2) while encodeWS is set, or as RFC2152 Set O
   (3) while encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 ||                                       \
     (c) <= 0 ||                                        \
     utf7_special[(c)] == 1 ||                          \
     (encodeWS && (utf7_special[(c)] == 2)) ||          \
     (encodeO && (utf7_special[(c)] == 3)))
1441
/* B64(n): base64 alphabet character for the low six bits of `n`. */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" \
     [(n) & 0x3f])

/* B64CHAR(c): true when `c` belongs to the base64 alphabet.
   NOTE(review): `c` is passed to isalnum() unchanged; callers
   presumably keep it within the range isalnum() accepts -- confirm. */
#define B64CHAR(c)                                      \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of the base64 character `c`; the result is
   only meaningful when B64CHAR(c) holds. */
#define UB64(c)                                         \
    ((c) == '+' ? 62 :                                  \
     (c) == '/' ? 63 :                                  \
     (c) >= 'a' ? (c) - 71 :                            \
     (c) >= 'A' ? (c) - 65 :                            \
     (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001449
/* ENCODE(out, ch, bits): while at least six bits are pending in `ch`,
   write the base64 character for the topmost pending six bits through
   `out` (advancing it) and reduce `bits` by six.  `out` and `bits` are
   modified in place; the expansion is a bare loop statement. */
#define ENCODE(out, ch, bits)                           \
    for (; bits >= 6; bits -= 6) {                      \
        *out++ = B64(ch >> (bits-6));                   \
    }
1455
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in `ch`, take the topmost pending 16 bits as one Py_UNICODE unit and
   reduce `bits` by 16.

   - If `surrogate` is set, the unit is dropped and the flag cleared.
   - If the unit lies in 0xDC00..0xDFFF, `surrogate` is set, `errmsg`
     is assigned and control jumps to the `utf7Error` label.
   - Otherwise the unit is stored through `out`, which is advanced.

   The expansion site must provide an `errmsg` variable and a
   `utf7Error` label.  The expansion is a bare loop statement. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high */       \
            /* surrogate so let's not bother seeing if the low */       \
            /* surrogate is correct or not */                           \
            surrogate = 0;                                              \
        }                                                               \
        else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {                  \
            /* This is a surrogate pair. Unfortunately we can't */      \
            /* represent it in a 16-bit character */                    \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        }                                                               \
        else {                                                          \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001474
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001475PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001476 Py_ssize_t size,
1477 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001478{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001479 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1480}
1481
1482PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001483 Py_ssize_t size,
1484 const char *errors,
1485 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001486{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001487 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001488 Py_ssize_t startinpos;
1489 Py_ssize_t endinpos;
1490 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001491 const char *e;
1492 PyUnicodeObject *unicode;
1493 Py_UNICODE *p;
1494 const char *errmsg = "";
1495 int inShift = 0;
1496 unsigned int bitsleft = 0;
1497 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 int surrogate = 0;
1499 PyObject *errorHandler = NULL;
1500 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001501
1502 unicode = _PyUnicode_New(size);
1503 if (!unicode)
1504 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001505 if (size == 0) {
1506 if (consumed)
1507 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001508 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001509 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001510
1511 p = unicode->str;
1512 e = s + size;
1513
1514 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001515 Py_UNICODE ch;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001516 restart:
Antoine Pitrou4982d5d2008-07-25 17:45:59 +00001517 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001518
1519 if (inShift) {
1520 if ((ch == '-') || !B64CHAR(ch)) {
1521 inShift = 0;
1522 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001523
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001524 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1525 if (bitsleft >= 6) {
1526 /* The shift sequence has a partial character in it. If
1527 bitsleft < 6 then we could just classify it as padding
1528 but that is not the case here */
1529
1530 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001531 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001532 }
1533 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001534 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535 here so indicate the potential of a misencoded character. */
1536
1537 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1538 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1539 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001540 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001541 }
1542
1543 if (ch == '-') {
1544 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001545 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546 inShift = 1;
1547 }
1548 } else if (SPECIAL(ch,0,0)) {
1549 errmsg = "unexpected special character";
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001550 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551 } else {
1552 *p++ = ch;
1553 }
1554 } else {
1555 charsleft = (charsleft << 6) | UB64(ch);
1556 bitsleft += 6;
1557 s++;
1558 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1559 }
1560 }
1561 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001562 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 s++;
1564 if (s < e && *s == '-') {
1565 s++;
1566 *p++ = '+';
1567 } else
1568 {
1569 inShift = 1;
1570 bitsleft = 0;
1571 }
1572 }
1573 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001574 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001575 errmsg = "unexpected special character";
1576 s++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001577 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578 }
1579 else {
1580 *p++ = ch;
1581 s++;
1582 }
1583 continue;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001584 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001585 outpos = p-PyUnicode_AS_UNICODE(unicode);
1586 endinpos = s-starts;
1587 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001588 errors, &errorHandler,
1589 "utf7", errmsg,
1590 starts, size, &startinpos, &endinpos, &exc, &s,
1591 &unicode, &outpos, &p))
1592 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 }
1594
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001595 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 outpos = p-PyUnicode_AS_UNICODE(unicode);
1597 endinpos = size;
1598 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001599 errors, &errorHandler,
1600 "utf7", "unterminated shift sequence",
1601 starts, size, &startinpos, &endinpos, &exc, &s,
1602 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 if (s < e)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001605 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001607 if (consumed) {
1608 if(inShift)
1609 *consumed = startinpos;
1610 else
1611 *consumed = s-starts;
1612 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001613
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001614 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001615 goto onError;
1616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 Py_XDECREF(errorHandler);
1618 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001619 return (PyObject *)unicode;
1620
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001621 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001622 Py_XDECREF(errorHandler);
1623 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624 Py_DECREF(unicode);
1625 return NULL;
1626}
1627
1628
1629PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001630 Py_ssize_t size,
1631 int encodeSetO,
1632 int encodeWhiteSpace,
1633 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001634{
1635 PyObject *v;
1636 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001637 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001639 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001640 unsigned int bitsleft = 0;
1641 unsigned long charsleft = 0;
1642 char * out;
1643 char * start;
1644
Neal Norwitze7d8be82008-07-31 17:17:14 +00001645 if (cbAllocated / 5 != size)
1646 return PyErr_NoMemory();
1647
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648 if (size == 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001649 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001651 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 if (v == NULL)
1653 return NULL;
1654
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001655 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001656 for (;i < size; ++i) {
1657 Py_UNICODE ch = s[i];
1658
1659 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001660 if (ch == '+') {
1661 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 *out++ = '-';
1663 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1664 charsleft = ch;
1665 bitsleft = 16;
1666 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001667 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001669 } else {
1670 *out++ = (char) ch;
1671 }
1672 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1674 *out++ = B64(charsleft << (6-bitsleft));
1675 charsleft = 0;
1676 bitsleft = 0;
1677 /* Characters not in the BASE64 set implicitly unshift the sequence
1678 so no '-' is required, except if the character is itself a '-' */
1679 if (B64CHAR(ch) || ch == '-') {
1680 *out++ = '-';
1681 }
1682 inShift = 0;
1683 *out++ = (char) ch;
1684 } else {
1685 bitsleft += 16;
1686 charsleft = (charsleft << 16) | ch;
1687 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1688
Jesus Cea585ad8a2009-07-02 15:37:21 +00001689 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001690 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 or '-' then the shift sequence will be terminated implicitly and we
1692 don't have to insert a '-'. */
1693
1694 if (bitsleft == 0) {
1695 if (i + 1 < size) {
1696 Py_UNICODE ch2 = s[i+1];
1697
1698 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001699
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001700 } else if (B64CHAR(ch2) || ch2 == '-') {
1701 *out++ = '-';
1702 inShift = 0;
1703 } else {
1704 inShift = 0;
1705 }
1706
1707 }
1708 else {
1709 *out++ = '-';
1710 inShift = 0;
1711 }
1712 }
Tim Petersced69f82003-09-16 20:30:58 +00001713 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001715 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716 if (bitsleft) {
1717 *out++= B64(charsleft << (6-bitsleft) );
1718 *out++ = '-';
1719 }
1720
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001721 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001722 return v;
1723}
1724
1725#undef SPECIAL
1726#undef B64
1727#undef B64CHAR
1728#undef UB64
1729#undef ENCODE
1730#undef DECODE
1731
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732/* --- UTF-8 Codec -------------------------------------------------------- */
1733
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the overlong-only leads C0-C1, and F5-FF.
   See RFC 3629 for details.  The table is read-only. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1755
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001757 Py_ssize_t size,
1758 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759{
Walter Dörwald69652032004-09-07 20:24:22 +00001760 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1761}
1762
1763PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001764 Py_ssize_t size,
1765 const char *errors,
1766 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001767{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 int n;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001770 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001771 Py_ssize_t startinpos;
1772 Py_ssize_t endinpos;
1773 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 const char *e;
1775 PyUnicodeObject *unicode;
1776 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001777 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 PyObject *errorHandler = NULL;
1779 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780
1781 /* Note: size will always be longer than the resulting Unicode
1782 character count */
1783 unicode = _PyUnicode_New(size);
1784 if (!unicode)
1785 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001786 if (size == 0) {
1787 if (consumed)
1788 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791
1792 /* Unpack UTF-8 encoded data */
1793 p = unicode->str;
1794 e = s + size;
1795
1796 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001797 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798
1799 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001800 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 s++;
1802 continue;
1803 }
1804
1805 n = utf8_code_length[ch];
1806
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001807 if (s + n > e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001808 if (consumed)
1809 break;
1810 else {
1811 errmsg = "unexpected end of data";
1812 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001813 endinpos = startinpos+1;
1814 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1815 endinpos++;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001816 goto utf8Error;
1817 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819
1820 switch (n) {
1821
1822 case 0:
Ezio Melotti86e5e172010-07-03 05:34:39 +00001823 errmsg = "invalid start byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001824 startinpos = s-starts;
1825 endinpos = startinpos+1;
1826 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827
1828 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001829 errmsg = "internal error";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001830 startinpos = s-starts;
1831 endinpos = startinpos+1;
1832 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833
1834 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001835 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti86e5e172010-07-03 05:34:39 +00001836 errmsg = "invalid continuation byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001837 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001838 endinpos = startinpos + 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001839 goto utf8Error;
1840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti86e5e172010-07-03 05:34:39 +00001842 assert ((ch > 0x007F) && (ch <= 0x07FF));
1843 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 break;
1845
1846 case 3:
Ezio Melotti86e5e172010-07-03 05:34:39 +00001847 /* XXX: surrogates shouldn't be valid UTF-8!
1848 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1849 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1850 Uncomment the 2 lines below to make them invalid,
1851 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001852 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti86e5e172010-07-03 05:34:39 +00001853 (s[2] & 0xc0) != 0x80 ||
1854 ((unsigned char)s[0] == 0xE0 &&
1855 (unsigned char)s[1] < 0xA0)/* ||
1856 ((unsigned char)s[0] == 0xED &&
1857 (unsigned char)s[1] > 0x9F)*/) {
1858 errmsg = "invalid continuation byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001859 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001860 endinpos = startinpos + 1;
1861
1862 /* if s[1] first two bits are 1 and 0, then the invalid
1863 continuation byte is s[2], so increment endinpos by 1,
1864 if not, s[1] is invalid and endinpos doesn't need to
1865 be incremented. */
1866 if ((s[1] & 0xC0) == 0x80)
1867 endinpos++;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001868 goto utf8Error;
1869 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti86e5e172010-07-03 05:34:39 +00001871 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
1872 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001873 break;
1874
1875 case 4:
1876 if ((s[1] & 0xc0) != 0x80 ||
1877 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti86e5e172010-07-03 05:34:39 +00001878 (s[3] & 0xc0) != 0x80 ||
1879 ((unsigned char)s[0] == 0xF0 &&
1880 (unsigned char)s[1] < 0x90) ||
1881 ((unsigned char)s[0] == 0xF4 &&
1882 (unsigned char)s[1] > 0x8F)) {
1883 errmsg = "invalid continuation byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001884 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001885 endinpos = startinpos + 1;
1886 if ((s[1] & 0xC0) == 0x80) {
1887 endinpos++;
1888 if ((s[2] & 0xC0) == 0x80)
1889 endinpos++;
1890 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001891 goto utf8Error;
1892 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001893 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti86e5e172010-07-03 05:34:39 +00001894 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1895 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
1896
Fredrik Lundh8f455852001-06-27 18:59:43 +00001897#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001898 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001899#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001900 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001901
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001902 /* translate from 10000..10FFFF to 0..FFFF */
1903 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001904
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001905 /* high surrogate = top 10 bits added to D800 */
1906 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001907
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001908 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001909 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001910#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 }
1913 s += n;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001914 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001915
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001916 utf8Error:
1917 outpos = p-PyUnicode_AS_UNICODE(unicode);
1918 if (unicode_decode_call_errorhandler(
1919 errors, &errorHandler,
1920 "utf8", errmsg,
1921 starts, size, &startinpos, &endinpos, &exc, &s,
1922 &unicode, &outpos, &p))
1923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924 }
Walter Dörwald69652032004-09-07 20:24:22 +00001925 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001926 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927
1928 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001929 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 goto onError;
1931
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 Py_XDECREF(errorHandler);
1933 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 return (PyObject *)unicode;
1935
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001936 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001937 Py_XDECREF(errorHandler);
1938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 Py_DECREF(unicode);
1940 return NULL;
1941}
1942
Tim Peters602f7402002-04-27 18:03:26 +00001943/* Allocation strategy: if the string is short, convert into a stack buffer
1944 and allocate exactly as much space needed at the end. Else allocate the
1945 maximum possible needed (4 result bytes per Unicode character), and return
1946 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001947*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001948PyObject *
1949PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001950 Py_ssize_t size,
1951 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952{
Tim Peters602f7402002-04-27 18:03:26 +00001953#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001954
Martin v. Löwis18e16552006-02-15 17:27:45 +00001955 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001956 PyObject *v; /* result string object */
1957 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001958 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001959 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001960 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001961
Tim Peters602f7402002-04-27 18:03:26 +00001962 assert(s != NULL);
1963 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964
Tim Peters602f7402002-04-27 18:03:26 +00001965 if (size <= MAX_SHORT_UNICHARS) {
1966 /* Write into the stack buffer; nallocated can't overflow.
1967 * At the end, we'll allocate exactly as much heap space as it
1968 * turns out we need.
1969 */
1970 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1971 v = NULL; /* will allocate after we're done */
1972 p = stackbuf;
1973 }
1974 else {
1975 /* Overallocate on the heap, and give the excess back at the end. */
1976 nallocated = size * 4;
1977 if (nallocated / 4 != size) /* overflow! */
1978 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001979 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001980 if (v == NULL)
1981 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001982 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001983 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001984
Tim Peters602f7402002-04-27 18:03:26 +00001985 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001986 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001987
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001988 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001989 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001991
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001993 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001994 *p++ = (char)(0xc0 | (ch >> 6));
1995 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001996 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001997 else {
Tim Peters602f7402002-04-27 18:03:26 +00001998 /* Encode UCS2 Unicode ordinals */
1999 if (ch < 0x10000) {
2000 /* Special case: check for high surrogate */
2001 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2002 Py_UCS4 ch2 = s[i];
2003 /* Check for low surrogate and combine the two to
2004 form a UCS4 value */
2005 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002006 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002007 i++;
2008 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002009 }
Tim Peters602f7402002-04-27 18:03:26 +00002010 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002011 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002012 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002013 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2014 *p++ = (char)(0x80 | (ch & 0x3f));
2015 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002016 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002017 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002018 /* Encode UCS4 Unicode ordinals */
2019 *p++ = (char)(0xf0 | (ch >> 18));
2020 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2021 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2022 *p++ = (char)(0x80 | (ch & 0x3f));
2023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002025
Tim Peters602f7402002-04-27 18:03:26 +00002026 if (v == NULL) {
2027 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002028 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002029 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002030 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002031 }
2032 else {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002033 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002034 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002035 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002036 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002039
Tim Peters602f7402002-04-27 18:03:26 +00002040#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041}
2042
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2044{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 if (!PyUnicode_Check(unicode)) {
2046 PyErr_BadArgument();
2047 return NULL;
2048 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002049 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002050 PyUnicode_GET_SIZE(unicode),
2051 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052}
2053
Walter Dörwald6e390802007-08-17 16:41:28 +00002054/* --- UTF-32 Codec ------------------------------------------------------- */
2055
2056PyObject *
2057PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002058 Py_ssize_t size,
2059 const char *errors,
2060 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002061{
2062 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2063}
2064
2065PyObject *
2066PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002067 Py_ssize_t size,
2068 const char *errors,
2069 int *byteorder,
2070 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002071{
2072 const char *starts = s;
2073 Py_ssize_t startinpos;
2074 Py_ssize_t endinpos;
2075 Py_ssize_t outpos;
2076 PyUnicodeObject *unicode;
2077 Py_UNICODE *p;
2078#ifndef Py_UNICODE_WIDE
Antoine Pitrou4595e512010-06-11 21:48:02 +00002079 int pairs = 0;
Walter Dörwald6e390802007-08-17 16:41:28 +00002080#else
2081 const int pairs = 0;
2082#endif
Antoine Pitrou4595e512010-06-11 21:48:02 +00002083 const unsigned char *q, *e, *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002084 int bo = 0; /* assume native ordering by default */
2085 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002086 /* Offsets from q for retrieving bytes in the right order. */
2087#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2088 int iorder[] = {0, 1, 2, 3};
2089#else
2090 int iorder[] = {3, 2, 1, 0};
2091#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002092 PyObject *errorHandler = NULL;
2093 PyObject *exc = NULL;
Antoine Pitrou4595e512010-06-11 21:48:02 +00002094
Walter Dörwald6e390802007-08-17 16:41:28 +00002095 q = (unsigned char *)s;
2096 e = q + size;
2097
2098 if (byteorder)
2099 bo = *byteorder;
2100
2101 /* Check for BOM marks (U+FEFF) in the input and adjust current
2102 byte order setting accordingly. In native mode, the leading BOM
2103 mark is skipped, in all other modes, it is copied to the output
2104 stream as-is (giving a ZWNBSP character). */
2105 if (bo == 0) {
2106 if (size >= 4) {
2107 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002108 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002109#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002110 if (bom == 0x0000FEFF) {
2111 q += 4;
2112 bo = -1;
2113 }
2114 else if (bom == 0xFFFE0000) {
2115 q += 4;
2116 bo = 1;
2117 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002118#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002119 if (bom == 0x0000FEFF) {
2120 q += 4;
2121 bo = 1;
2122 }
2123 else if (bom == 0xFFFE0000) {
2124 q += 4;
2125 bo = -1;
2126 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002127#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002128 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002129 }
2130
2131 if (bo == -1) {
2132 /* force LE */
2133 iorder[0] = 0;
2134 iorder[1] = 1;
2135 iorder[2] = 2;
2136 iorder[3] = 3;
2137 }
2138 else if (bo == 1) {
2139 /* force BE */
2140 iorder[0] = 3;
2141 iorder[1] = 2;
2142 iorder[2] = 1;
2143 iorder[3] = 0;
2144 }
2145
Antoine Pitrou4595e512010-06-11 21:48:02 +00002146 /* On narrow builds we split characters outside the BMP into two
2147 codepoints => count how much extra space we need. */
2148#ifndef Py_UNICODE_WIDE
2149 for (qq = q; qq < e; qq += 4)
2150 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2151 pairs++;
2152#endif
2153
2154 /* This might be one to much, because of a BOM */
2155 unicode = _PyUnicode_New((size+3)/4+pairs);
2156 if (!unicode)
2157 return NULL;
2158 if (size == 0)
2159 return (PyObject *)unicode;
2160
2161 /* Unpack UTF-32 encoded data */
2162 p = unicode->str;
2163
Walter Dörwald6e390802007-08-17 16:41:28 +00002164 while (q < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002165 Py_UCS4 ch;
2166 /* remaining bytes at the end? (size should be divisible by 4) */
2167 if (e-q<4) {
2168 if (consumed)
2169 break;
2170 errmsg = "truncated data";
2171 startinpos = ((const char *)q)-starts;
2172 endinpos = ((const char *)e)-starts;
2173 goto utf32Error;
2174 /* The remaining input chars are ignored if the callback
2175 chooses to skip the input */
2176 }
2177 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2178 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002179
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002180 if (ch >= 0x110000)
2181 {
2182 errmsg = "codepoint not in range(0x110000)";
2183 startinpos = ((const char *)q)-starts;
2184 endinpos = startinpos+4;
2185 goto utf32Error;
2186 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002187#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002188 if (ch >= 0x10000)
2189 {
2190 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2191 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2192 }
2193 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002194#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002195 *p++ = ch;
2196 q += 4;
2197 continue;
2198 utf32Error:
2199 outpos = p-PyUnicode_AS_UNICODE(unicode);
2200 if (unicode_decode_call_errorhandler(
2201 errors, &errorHandler,
2202 "utf32", errmsg,
Georg Brandlf7a09be2009-09-17 11:33:31 +00002203 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002204 &unicode, &outpos, &p))
2205 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002206 }
2207
2208 if (byteorder)
2209 *byteorder = bo;
2210
2211 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002212 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002213
2214 /* Adjust length */
2215 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2216 goto onError;
2217
2218 Py_XDECREF(errorHandler);
2219 Py_XDECREF(exc);
2220 return (PyObject *)unicode;
2221
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002222 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002223 Py_DECREF(unicode);
2224 Py_XDECREF(errorHandler);
2225 Py_XDECREF(exc);
2226 return NULL;
2227}
2228
2229PyObject *
2230PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002231 Py_ssize_t size,
2232 const char *errors,
2233 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002234{
2235 PyObject *v;
2236 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002237 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002238#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002239 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002240#else
2241 const int pairs = 0;
2242#endif
2243 /* Offsets from p for storing byte pairs in the right order. */
2244#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2245 int iorder[] = {0, 1, 2, 3};
2246#else
2247 int iorder[] = {3, 2, 1, 0};
2248#endif
2249
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002250#define STORECHAR(CH) \
2251 do { \
2252 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2253 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2254 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2255 p[iorder[0]] = (CH) & 0xff; \
2256 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002257 } while(0)
2258
2259 /* In narrow builds we can output surrogate pairs as one codepoint,
2260 so we need less space. */
2261#ifndef Py_UNICODE_WIDE
2262 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002263 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2264 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2265 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002266#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002267 nsize = (size - pairs + (byteorder == 0));
2268 bytesize = nsize * 4;
2269 if (bytesize / 4 != nsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002270 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002271 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002272 if (v == NULL)
2273 return NULL;
2274
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002275 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002276 if (byteorder == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002277 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002278 if (size == 0)
2279 return v;
2280
2281 if (byteorder == -1) {
2282 /* force LE */
2283 iorder[0] = 0;
2284 iorder[1] = 1;
2285 iorder[2] = 2;
2286 iorder[3] = 3;
2287 }
2288 else if (byteorder == 1) {
2289 /* force BE */
2290 iorder[0] = 3;
2291 iorder[1] = 2;
2292 iorder[2] = 1;
2293 iorder[3] = 0;
2294 }
2295
2296 while (size-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002297 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002298#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002299 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2300 Py_UCS4 ch2 = *s;
2301 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2302 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2303 s++;
2304 size--;
2305 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002306 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002307#endif
2308 STORECHAR(ch);
2309 }
2310 return v;
2311#undef STORECHAR
2312}
2313
2314PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2315{
2316 if (!PyUnicode_Check(unicode)) {
2317 PyErr_BadArgument();
2318 return NULL;
2319 }
2320 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002321 PyUnicode_GET_SIZE(unicode),
2322 NULL,
2323 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002324}
2325
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326/* --- UTF-16 Codec ------------------------------------------------------- */
2327
Tim Peters772747b2001-08-09 22:21:55 +00002328PyObject *
2329PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002330 Py_ssize_t size,
2331 const char *errors,
2332 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333{
Walter Dörwald69652032004-09-07 20:24:22 +00002334 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2335}
2336
2337PyObject *
2338PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002339 Py_ssize_t size,
2340 const char *errors,
2341 int *byteorder,
2342 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002343{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002344 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002345 Py_ssize_t startinpos;
2346 Py_ssize_t endinpos;
2347 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002348 PyUnicodeObject *unicode;
2349 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002350 const unsigned char *q, *e;
2351 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002352 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002353 /* Offsets from q for retrieving byte pairs in the right order. */
2354#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2355 int ihi = 1, ilo = 0;
2356#else
2357 int ihi = 0, ilo = 1;
2358#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002359 PyObject *errorHandler = NULL;
2360 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361
2362 /* Note: size will always be longer than the resulting Unicode
2363 character count */
2364 unicode = _PyUnicode_New(size);
2365 if (!unicode)
2366 return NULL;
2367 if (size == 0)
2368 return (PyObject *)unicode;
2369
2370 /* Unpack UTF-16 encoded data */
2371 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002372 q = (unsigned char *)s;
2373 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374
2375 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002376 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002378 /* Check for BOM marks (U+FEFF) in the input and adjust current
2379 byte order setting accordingly. In native mode, the leading BOM
2380 mark is skipped, in all other modes, it is copied to the output
2381 stream as-is (giving a ZWNBSP character). */
2382 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002383 if (size >= 2) {
2384 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002385#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002386 if (bom == 0xFEFF) {
2387 q += 2;
2388 bo = -1;
2389 }
2390 else if (bom == 0xFFFE) {
2391 q += 2;
2392 bo = 1;
2393 }
Tim Petersced69f82003-09-16 20:30:58 +00002394#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002395 if (bom == 0xFEFF) {
2396 q += 2;
2397 bo = 1;
2398 }
2399 else if (bom == 0xFFFE) {
2400 q += 2;
2401 bo = -1;
2402 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002403#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002404 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406
Tim Peters772747b2001-08-09 22:21:55 +00002407 if (bo == -1) {
2408 /* force LE */
2409 ihi = 1;
2410 ilo = 0;
2411 }
2412 else if (bo == 1) {
2413 /* force BE */
2414 ihi = 0;
2415 ilo = 1;
2416 }
2417
2418 while (q < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002419 Py_UNICODE ch;
2420 /* remaining bytes at the end? (size should be even) */
2421 if (e-q<2) {
2422 if (consumed)
2423 break;
2424 errmsg = "truncated data";
2425 startinpos = ((const char *)q)-starts;
2426 endinpos = ((const char *)e)-starts;
2427 goto utf16Error;
2428 /* The remaining input chars are ignored if the callback
2429 chooses to skip the input */
2430 }
2431 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002432
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002433 q += 2;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002434
2435 if (ch < 0xD800 || ch > 0xDFFF) {
2436 *p++ = ch;
2437 continue;
2438 }
2439
2440 /* UTF-16 code pair: */
2441 if (q >= e) {
2442 errmsg = "unexpected end of data";
2443 startinpos = (((const char *)q)-2)-starts;
2444 endinpos = ((const char *)e)-starts;
2445 goto utf16Error;
2446 }
2447 if (0xD800 <= ch && ch <= 0xDBFF) {
2448 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2449 q += 2;
2450 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002451#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002452 *p++ = ch;
2453 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002454#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002455 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002456#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002457 continue;
2458 }
2459 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002460 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002461 startinpos = (((const char *)q)-4)-starts;
2462 endinpos = startinpos+2;
2463 goto utf16Error;
2464 }
2465
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002466 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002467 errmsg = "illegal encoding";
2468 startinpos = (((const char *)q)-2)-starts;
2469 endinpos = startinpos+2;
2470 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002471
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002472 utf16Error:
2473 outpos = p-PyUnicode_AS_UNICODE(unicode);
2474 if (unicode_decode_call_errorhandler(
2475 errors, &errorHandler,
2476 "utf16", errmsg,
2477 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2478 &unicode, &outpos, &p))
2479 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 }
2481
2482 if (byteorder)
2483 *byteorder = bo;
2484
Walter Dörwald69652032004-09-07 20:24:22 +00002485 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002486 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002487
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002489 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 goto onError;
2491
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002492 Py_XDECREF(errorHandler);
2493 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 return (PyObject *)unicode;
2495
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002496 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498 Py_XDECREF(errorHandler);
2499 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 return NULL;
2501}
2502
Tim Peters772747b2001-08-09 22:21:55 +00002503PyObject *
2504PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002505 Py_ssize_t size,
2506 const char *errors,
2507 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508{
2509 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002510 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002511 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002512#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002513 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002514#else
2515 const int pairs = 0;
2516#endif
Tim Peters772747b2001-08-09 22:21:55 +00002517 /* Offsets from p for storing byte pairs in the right order. */
2518#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2519 int ihi = 1, ilo = 0;
2520#else
2521 int ihi = 0, ilo = 1;
2522#endif
2523
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002524#define STORECHAR(CH) \
2525 do { \
2526 p[ihi] = ((CH) >> 8) & 0xff; \
2527 p[ilo] = (CH) & 0xff; \
2528 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002529 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002531#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002532 for (i = pairs = 0; i < size; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002533 if (s[i] >= 0x10000)
2534 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002535#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002536 /* 2 * (size + pairs + (byteorder == 0)) */
2537 if (size > PY_SSIZE_T_MAX ||
2538 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002539 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002540 nsize = size + pairs + (byteorder == 0);
2541 bytesize = nsize * 2;
2542 if (bytesize / 2 != nsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002543 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002544 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 if (v == NULL)
2546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002548 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 if (byteorder == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002550 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002551 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002552 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002553
2554 if (byteorder == -1) {
2555 /* force LE */
2556 ihi = 1;
2557 ilo = 0;
2558 }
2559 else if (byteorder == 1) {
2560 /* force BE */
2561 ihi = 0;
2562 ilo = 1;
2563 }
2564
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002565 while (size-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002566 Py_UNICODE ch = *s++;
2567 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002568#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002569 if (ch >= 0x10000) {
2570 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2571 ch = 0xD800 | ((ch-0x10000) >> 10);
2572 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002573#endif
Tim Peters772747b2001-08-09 22:21:55 +00002574 STORECHAR(ch);
2575 if (ch2)
2576 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002579#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580}
2581
2582PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2583{
2584 if (!PyUnicode_Check(unicode)) {
2585 PyErr_BadArgument();
2586 return NULL;
2587 }
2588 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002589 PyUnicode_GET_SIZE(unicode),
2590 NULL,
2591 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592}
2593
2594/* --- Unicode Escape Codec ----------------------------------------------- */
2595
/* Cached pointer to the character-name lookup API used for \N{...}
   escapes.  It starts out NULL and is filled in by
   PyUnicode_DecodeUnicodeEscape() the first time a \N escape is seen,
   from the "ucnhash_CAPI" attribute of the unicodedata module. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002597
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002599 Py_ssize_t size,
2600 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002602 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002603 Py_ssize_t startinpos;
2604 Py_ssize_t endinpos;
2605 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002610 char* message;
2611 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 PyObject *errorHandler = NULL;
2613 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002614
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 /* Escaped strings will always be longer than the resulting
2616 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002617 length after conversion to the true value.
2618 (but if the error callback returns a long replacement string
2619 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 v = _PyUnicode_New(size);
2621 if (v == NULL)
2622 goto onError;
2623 if (size == 0)
2624 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002628
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 while (s < end) {
2630 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002631 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002632 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633
2634 /* Non-escape characters are interpreted as Unicode ordinals */
2635 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002636 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637 continue;
2638 }
2639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 /* \ - Escapes */
2642 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002643 c = *s++;
2644 if (s > end)
2645 c = '\0'; /* Invalid after \ */
2646 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002648 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 case '\n': break;
2650 case '\\': *p++ = '\\'; break;
2651 case '\'': *p++ = '\''; break;
2652 case '\"': *p++ = '\"'; break;
2653 case 'b': *p++ = '\b'; break;
2654 case 'f': *p++ = '\014'; break; /* FF */
2655 case 't': *p++ = '\t'; break;
2656 case 'n': *p++ = '\n'; break;
2657 case 'r': *p++ = '\r'; break;
2658 case 'v': *p++ = '\013'; break; /* VT */
2659 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2660
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002661 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 case '0': case '1': case '2': case '3':
2663 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002664 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002665 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002666 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002667 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002668 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002670 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 break;
2672
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002673 /* hex escapes */
2674 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002676 digits = 2;
2677 message = "truncated \\xXX escape";
2678 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002680 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002682 digits = 4;
2683 message = "truncated \\uXXXX escape";
2684 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002686 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002687 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002688 digits = 8;
2689 message = "truncated \\UXXXXXXXX escape";
2690 hexescape:
2691 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002692 outpos = p-PyUnicode_AS_UNICODE(v);
2693 if (s+digits>end) {
2694 endinpos = size;
2695 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002696 errors, &errorHandler,
2697 "unicodeescape", "end of string in escape sequence",
2698 starts, size, &startinpos, &endinpos, &exc, &s,
2699 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002700 goto onError;
2701 goto nextByte;
2702 }
2703 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002704 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002705 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 endinpos = (s+i+1)-starts;
2707 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002708 errors, &errorHandler,
2709 "unicodeescape", message,
2710 starts, size, &startinpos, &endinpos, &exc, &s,
2711 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002712 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002714 }
2715 chr = (chr<<4) & ~0xF;
2716 if (c >= '0' && c <= '9')
2717 chr += c - '0';
2718 else if (c >= 'a' && c <= 'f')
2719 chr += 10 + c - 'a';
2720 else
2721 chr += 10 + c - 'A';
2722 }
2723 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002724 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 /* _decoding_error will have already written into the
2726 target buffer. */
2727 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002728 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002729 /* when we get here, chr is a 32-bit unicode character */
2730 if (chr <= 0xffff)
2731 /* UCS-2 character */
2732 *p++ = (Py_UNICODE) chr;
2733 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002734 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002735 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002736#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002737 *p++ = chr;
2738#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002739 chr -= 0x10000L;
2740 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002741 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002742#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002743 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 endinpos = s-starts;
2745 outpos = p-PyUnicode_AS_UNICODE(v);
2746 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002747 errors, &errorHandler,
2748 "unicodeescape", "illegal Unicode character",
2749 starts, size, &startinpos, &endinpos, &exc, &s,
2750 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002751 goto onError;
2752 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002753 break;
2754
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002755 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002756 case 'N':
2757 message = "malformed \\N character escape";
2758 if (ucnhash_CAPI == NULL) {
2759 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002760 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002761 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002762 if (m == NULL)
2763 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002764 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002765 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002766 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002767 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002768 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002769 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002770 if (ucnhash_CAPI == NULL)
2771 goto ucnhashError;
2772 }
2773 if (*s == '{') {
2774 const char *start = s+1;
2775 /* look for the closing brace */
2776 while (*s != '}' && s < end)
2777 s++;
2778 if (s > start && s < end && *s == '}') {
2779 /* found a name. look it up in the unicode database */
2780 message = "unknown Unicode character name";
2781 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002782 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002783 goto store;
2784 }
2785 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 endinpos = s-starts;
2787 outpos = p-PyUnicode_AS_UNICODE(v);
2788 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002789 errors, &errorHandler,
2790 "unicodeescape", message,
2791 starts, size, &startinpos, &endinpos, &exc, &s,
2792 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002793 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002794 break;
2795
2796 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002797 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002798 message = "\\ at end of string";
2799 s--;
2800 endinpos = s-starts;
2801 outpos = p-PyUnicode_AS_UNICODE(v);
2802 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002803 errors, &errorHandler,
2804 "unicodeescape", message,
2805 starts, size, &startinpos, &endinpos, &exc, &s,
2806 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002807 goto onError;
2808 }
2809 else {
2810 *p++ = '\\';
2811 *p++ = (unsigned char)s[-1];
2812 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002813 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002815 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002818 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002819 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002820 Py_XDECREF(errorHandler);
2821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002823
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002824 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002825 PyErr_SetString(
2826 PyExc_UnicodeError,
2827 "\\N escapes not supported (can't load unicodedata module)"
2828 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002829 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002830 Py_XDECREF(errorHandler);
2831 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002832 return NULL;
2833
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002834 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 Py_XDECREF(errorHandler);
2837 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 return NULL;
2839}
2840
2841/* Return a Unicode-Escape string version of the Unicode object.
2842
2843 If quotes is true, the string is enclosed in u"" or u'' quotes as
2844 appropriate.
2845
2846*/
2847
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002848Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002849 Py_ssize_t size,
2850 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002851{
2852 /* like wcschr, but doesn't stop at NULL characters */
2853
2854 while (size-- > 0) {
2855 if (*s == ch)
2856 return s;
2857 s++;
2858 }
2859
2860 return NULL;
2861}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002862
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863static
2864PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002865 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 int quotes)
2867{
2868 PyObject *repr;
2869 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002871 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002872#ifdef Py_UNICODE_WIDE
2873 const Py_ssize_t expandsize = 10;
2874#else
2875 const Py_ssize_t expandsize = 6;
2876#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877
Neal Norwitz17753ec2006-08-21 22:21:19 +00002878 /* XXX(nnorwitz): rather than over-allocating, it would be
2879 better to choose a different scheme. Perhaps scan the
2880 first N-chars of the string and allocate based on that size.
2881 */
2882 /* Initial allocation is based on the longest-possible unichr
2883 escape.
2884
2885 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2886 unichr, so in this case it's the longest unichr escape. In
2887 narrow (UTF-16) builds this is five chars per source unichr
2888 since there are two unichrs in the surrogate pair, so in narrow
2889 (UTF-16) builds it's not the longest unichr escape.
2890
2891 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2892 so in the narrow (UTF-16) build case it's the longest unichr
2893 escape.
2894 */
2895
Neal Norwitze7d8be82008-07-31 17:17:14 +00002896 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002897 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002898
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002899 repr = PyString_FromStringAndSize(NULL,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002900 2
2901 + expandsize*size
2902 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 if (repr == NULL)
2904 return NULL;
2905
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002906 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907
2908 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002910 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 !findchar(s, size, '"')) ? '"' : '\'';
2912 }
2913 while (size-- > 0) {
2914 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002915
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002916 /* Escape quotes and backslashes */
2917 if ((quotes &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002918 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 *p++ = '\\';
2920 *p++ = (char) ch;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002921 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002922 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002923
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002924#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002925 /* Map 21-bit characters to '\U00xxxxxx' */
2926 else if (ch >= 0x10000) {
2927 *p++ = '\\';
2928 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002929 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2930 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2931 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2932 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2933 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2934 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2935 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002936 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002937 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002938 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002939#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002940 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2941 else if (ch >= 0xD800 && ch < 0xDC00) {
2942 Py_UNICODE ch2;
2943 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002944
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002945 ch2 = *s++;
2946 size--;
2947 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2948 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2949 *p++ = '\\';
2950 *p++ = 'U';
2951 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2952 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2953 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2954 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2955 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2956 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2957 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2958 *p++ = hexdigit[ucs & 0x0000000F];
2959 continue;
2960 }
2961 /* Fall through: isolated surrogates are copied as-is */
2962 s--;
2963 size++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002964 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002965#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002966
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002968 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969 *p++ = '\\';
2970 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002971 *p++ = hexdigit[(ch >> 12) & 0x000F];
2972 *p++ = hexdigit[(ch >> 8) & 0x000F];
2973 *p++ = hexdigit[(ch >> 4) & 0x000F];
2974 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002976
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002977 /* Map special whitespace to '\t', \n', '\r' */
2978 else if (ch == '\t') {
2979 *p++ = '\\';
2980 *p++ = 't';
2981 }
2982 else if (ch == '\n') {
2983 *p++ = '\\';
2984 *p++ = 'n';
2985 }
2986 else if (ch == '\r') {
2987 *p++ = '\\';
2988 *p++ = 'r';
2989 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002990
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002991 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002992 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002994 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002995 *p++ = hexdigit[(ch >> 4) & 0x000F];
2996 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002997 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002998
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 /* Copy everything else as-is */
3000 else
3001 *p++ = (char) ch;
3002 }
3003 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003004 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005
3006 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003007 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 return repr;
3009}
3010
3011PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003012 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013{
3014 return unicodeescape_string(s, size, 0);
3015}
3016
3017PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3018{
3019 if (!PyUnicode_Check(unicode)) {
3020 PyErr_BadArgument();
3021 return NULL;
3022 }
3023 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003024 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025}
3026
3027/* --- Raw Unicode Escape Codec ------------------------------------------- */
3028
3029PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003030 Py_ssize_t size,
3031 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003034 Py_ssize_t startinpos;
3035 Py_ssize_t endinpos;
3036 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 const char *end;
3040 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 PyObject *errorHandler = NULL;
3042 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003043
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 /* Escaped strings will always be longer than the resulting
3045 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 length after conversion to the true value. (But decoding error
3047 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 v = _PyUnicode_New(size);
3049 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003050 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003052 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 end = s + size;
3055 while (s < end) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003056 unsigned char c;
3057 Py_UCS4 x;
3058 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003059 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003061 /* Non-escape characters are interpreted as Unicode ordinals */
3062 if (*s != '\\') {
3063 *p++ = (unsigned char)*s++;
3064 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003065 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003066 startinpos = s-starts;
3067
3068 /* \u-escapes are only interpreted iff the number of leading
3069 backslashes if odd */
3070 bs = s;
3071 for (;s < end;) {
3072 if (*s != '\\')
3073 break;
3074 *p++ = (unsigned char)*s++;
3075 }
3076 if (((s - bs) & 1) == 0 ||
3077 s >= end ||
3078 (*s != 'u' && *s != 'U')) {
3079 continue;
3080 }
3081 p--;
3082 count = *s=='u' ? 4 : 8;
3083 s++;
3084
3085 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3086 outpos = p-PyUnicode_AS_UNICODE(v);
3087 for (x = 0, i = 0; i < count; ++i, ++s) {
3088 c = (unsigned char)*s;
3089 if (!isxdigit(c)) {
3090 endinpos = s-starts;
3091 if (unicode_decode_call_errorhandler(
3092 errors, &errorHandler,
3093 "rawunicodeescape", "truncated \\uXXXX",
3094 starts, size, &startinpos, &endinpos, &exc, &s,
3095 &v, &outpos, &p))
3096 goto onError;
3097 goto nextByte;
3098 }
3099 x = (x<<4) & ~0xF;
3100 if (c >= '0' && c <= '9')
3101 x += c - '0';
3102 else if (c >= 'a' && c <= 'f')
3103 x += 10 + c - 'a';
3104 else
3105 x += 10 + c - 'A';
3106 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003107 if (x <= 0xffff)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003108 /* UCS-2 character */
3109 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003110 else if (x <= 0x10ffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003111 /* UCS-4 character. Either store directly, or as
3112 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003113#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003114 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003115#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003116 x -= 0x10000L;
3117 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3118 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003119#endif
3120 } else {
3121 endinpos = s-starts;
3122 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003123 if (unicode_decode_call_errorhandler(
3124 errors, &errorHandler,
3125 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003126 starts, size, &startinpos, &endinpos, &exc, &s,
3127 &v, &outpos, &p))
3128 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003129 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003130 nextByte:
3131 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003133 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003134 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003135 Py_XDECREF(errorHandler);
3136 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003138
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003139 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003141 Py_XDECREF(errorHandler);
3142 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 return NULL;
3144}
3145
3146PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003147 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148{
3149 PyObject *repr;
3150 char *p;
3151 char *q;
3152
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003153 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003154#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003155 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003156#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003157 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003158#endif
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003159
Neal Norwitze7d8be82008-07-31 17:17:14 +00003160 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003161 return PyErr_NoMemory();
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003162
Neal Norwitze7d8be82008-07-31 17:17:14 +00003163 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 if (repr == NULL)
3165 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003166 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003167 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003169 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 while (size-- > 0) {
3171 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003172#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003173 /* Map 32-bit characters to '\Uxxxxxxxx' */
3174 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003175 *p++ = '\\';
3176 *p++ = 'U';
3177 *p++ = hexdigit[(ch >> 28) & 0xf];
3178 *p++ = hexdigit[(ch >> 24) & 0xf];
3179 *p++ = hexdigit[(ch >> 20) & 0xf];
3180 *p++ = hexdigit[(ch >> 16) & 0xf];
3181 *p++ = hexdigit[(ch >> 12) & 0xf];
3182 *p++ = hexdigit[(ch >> 8) & 0xf];
3183 *p++ = hexdigit[(ch >> 4) & 0xf];
3184 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003185 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003186 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003187#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003188 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3189 if (ch >= 0xD800 && ch < 0xDC00) {
3190 Py_UNICODE ch2;
3191 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003192
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003193 ch2 = *s++;
3194 size--;
3195 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3196 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3197 *p++ = '\\';
3198 *p++ = 'U';
3199 *p++ = hexdigit[(ucs >> 28) & 0xf];
3200 *p++ = hexdigit[(ucs >> 24) & 0xf];
3201 *p++ = hexdigit[(ucs >> 20) & 0xf];
3202 *p++ = hexdigit[(ucs >> 16) & 0xf];
3203 *p++ = hexdigit[(ucs >> 12) & 0xf];
3204 *p++ = hexdigit[(ucs >> 8) & 0xf];
3205 *p++ = hexdigit[(ucs >> 4) & 0xf];
3206 *p++ = hexdigit[ucs & 0xf];
3207 continue;
3208 }
3209 /* Fall through: isolated surrogates are copied as-is */
3210 s--;
3211 size++;
3212 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003213#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003214 /* Map 16-bit characters to '\uxxxx' */
3215 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 *p++ = '\\';
3217 *p++ = 'u';
3218 *p++ = hexdigit[(ch >> 12) & 0xf];
3219 *p++ = hexdigit[(ch >> 8) & 0xf];
3220 *p++ = hexdigit[(ch >> 4) & 0xf];
3221 *p++ = hexdigit[ch & 15];
3222 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003223 /* Copy everything else as-is */
3224 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 *p++ = (char) ch;
3226 }
3227 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003228 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 return repr;
3230}
3231
3232PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3233{
3234 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003235 PyErr_BadArgument();
3236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
3238 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003239 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240}
3241
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003242/* --- Unicode Internal Codec ------------------------------------------- */
3243
3244PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003245 Py_ssize_t size,
3246 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003247{
3248 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003249 Py_ssize_t startinpos;
3250 Py_ssize_t endinpos;
3251 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003252 PyUnicodeObject *v;
3253 Py_UNICODE *p;
3254 const char *end;
3255 const char *reason;
3256 PyObject *errorHandler = NULL;
3257 PyObject *exc = NULL;
3258
Neal Norwitzd43069c2006-01-08 01:12:10 +00003259#ifdef Py_UNICODE_WIDE
3260 Py_UNICODE unimax = PyUnicode_GetMax();
3261#endif
3262
Armin Rigo7ccbca92006-10-04 12:17:45 +00003263 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003264 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3265 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003266 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003267 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003268 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003269 p = PyUnicode_AS_UNICODE(v);
3270 end = s + size;
3271
3272 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003273 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003274 /* We have to sanity check the raw data, otherwise doom looms for
3275 some malformed UCS-4 data. */
3276 if (
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003277#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003278 *p > unimax || *p < 0 ||
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003279#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003280 end-s < Py_UNICODE_SIZE
3281 )
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003282 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003283 startinpos = s - starts;
3284 if (end-s < Py_UNICODE_SIZE) {
3285 endinpos = end-starts;
3286 reason = "truncated input";
3287 }
3288 else {
3289 endinpos = s - starts + Py_UNICODE_SIZE;
3290 reason = "illegal code point (> 0x10FFFF)";
3291 }
3292 outpos = p - PyUnicode_AS_UNICODE(v);
3293 if (unicode_decode_call_errorhandler(
3294 errors, &errorHandler,
3295 "unicode_internal", reason,
3296 starts, size, &startinpos, &endinpos, &exc, &s,
Benjamin Peterson828a7062008-12-27 17:05:29 +00003297 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003298 goto onError;
3299 }
3300 }
3301 else {
3302 p++;
3303 s += Py_UNICODE_SIZE;
3304 }
3305 }
3306
Martin v. Löwis412fb672006-04-13 06:34:32 +00003307 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003308 goto onError;
3309 Py_XDECREF(errorHandler);
3310 Py_XDECREF(exc);
3311 return (PyObject *)v;
3312
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003313 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003314 Py_XDECREF(v);
3315 Py_XDECREF(errorHandler);
3316 Py_XDECREF(exc);
3317 return NULL;
3318}
3319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320/* --- Latin-1 Codec ------------------------------------------------------ */
3321
3322PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003323 Py_ssize_t size,
3324 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325{
3326 PyUnicodeObject *v;
3327 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003328
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003330 if (size == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003331 Py_UNICODE r = *(unsigned char*)s;
3332 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003333 }
3334
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 v = _PyUnicode_New(size);
3336 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003337 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003339 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 p = PyUnicode_AS_UNICODE(v);
3341 while (size-- > 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003342 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003344
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003345 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 Py_XDECREF(v);
3347 return NULL;
3348}
3349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350/* create or adjust a UnicodeEncodeError */
3351static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003352 const char *encoding,
3353 const Py_UNICODE *unicode, Py_ssize_t size,
3354 Py_ssize_t startpos, Py_ssize_t endpos,
3355 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003357 if (*exceptionObject == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003358 *exceptionObject = PyUnicodeEncodeError_Create(
3359 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 }
3361 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003362 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3363 goto onError;
3364 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3365 goto onError;
3366 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3367 goto onError;
3368 return;
3369 onError:
3370 Py_DECREF(*exceptionObject);
3371 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 }
3373}
3374
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003375/* raises a UnicodeEncodeError */
3376static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003377 const char *encoding,
3378 const Py_UNICODE *unicode, Py_ssize_t size,
3379 Py_ssize_t startpos, Py_ssize_t endpos,
3380 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381{
3382 make_encode_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003383 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 if (*exceptionObject != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003385 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386}
3387
3388/* error handling callback helper:
3389 build arguments, call the callback and check the arguments,
3390 put the result into newpos and return the replacement string, which
3391 has to be freed by the caller */
3392static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003393 PyObject **errorHandler,
3394 const char *encoding, const char *reason,
3395 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3396 Py_ssize_t startpos, Py_ssize_t endpos,
3397 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003399 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400
3401 PyObject *restuple;
3402 PyObject *resunicode;
3403
3404 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003405 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003406 if (*errorHandler == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003407 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 }
3409
3410 make_encode_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003411 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 if (*exceptionObject == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003413 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003414
3415 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003416 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00003420 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003421 Py_DECREF(restuple);
3422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 }
3424 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003425 &resunicode, newpos)) {
3426 Py_DECREF(restuple);
3427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 }
3429 if (*newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003430 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003431 if (*newpos<0 || *newpos>size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003432 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3433 Py_DECREF(restuple);
3434 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 Py_INCREF(resunicode);
3437 Py_DECREF(restuple);
3438 return resunicode;
3439}
3440
3441static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003442 Py_ssize_t size,
3443 const char *errors,
3444 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445{
3446 /* output object */
3447 PyObject *res;
3448 /* pointers to the beginning and end+1 of input */
3449 const Py_UNICODE *startp = p;
3450 const Py_UNICODE *endp = p + size;
3451 /* pointer to the beginning of the unencodable characters */
3452 /* const Py_UNICODE *badp = NULL; */
3453 /* pointer into the output */
3454 char *str;
3455 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003456 Py_ssize_t respos = 0;
3457 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003458 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3459 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 PyObject *errorHandler = NULL;
3461 PyObject *exc = NULL;
3462 /* the following variable is used for caching string comparisons
3463 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3464 int known_errorHandler = -1;
3465
3466 /* allocate enough for a simple encoding without
3467 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003468 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 if (res == NULL)
3470 goto onError;
3471 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003472 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003473 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474 ressize = size;
3475
3476 while (p<endp) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003477 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003479 /* can we encode this? */
3480 if (c<limit) {
3481 /* no overflow check, because we know that the space is enough */
3482 *str++ = (char)c;
3483 ++p;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003484 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003485 else {
3486 Py_ssize_t unicodepos = p-startp;
3487 Py_ssize_t requiredsize;
3488 PyObject *repunicode;
3489 Py_ssize_t repsize;
3490 Py_ssize_t newpos;
3491 Py_ssize_t respos;
3492 Py_UNICODE *uni2;
3493 /* startpos for collecting unencodable chars */
3494 const Py_UNICODE *collstart = p;
3495 const Py_UNICODE *collend = p;
3496 /* find all unecodable characters */
3497 while ((collend < endp) && ((*collend)>=limit))
3498 ++collend;
3499 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3500 if (known_errorHandler==-1) {
3501 if ((errors==NULL) || (!strcmp(errors, "strict")))
3502 known_errorHandler = 1;
3503 else if (!strcmp(errors, "replace"))
3504 known_errorHandler = 2;
3505 else if (!strcmp(errors, "ignore"))
3506 known_errorHandler = 3;
3507 else if (!strcmp(errors, "xmlcharrefreplace"))
3508 known_errorHandler = 4;
3509 else
3510 known_errorHandler = 0;
3511 }
3512 switch (known_errorHandler) {
3513 case 1: /* strict */
3514 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3515 goto onError;
3516 case 2: /* replace */
3517 while (collstart++<collend)
3518 *str++ = '?'; /* fall through */
3519 case 3: /* ignore */
3520 p = collend;
3521 break;
3522 case 4: /* xmlcharrefreplace */
3523 respos = str-PyString_AS_STRING(res);
3524 /* determine replacement size (temporarily (mis)uses p) */
3525 for (p = collstart, repsize = 0; p < collend; ++p) {
3526 if (*p<10)
3527 repsize += 2+1+1;
3528 else if (*p<100)
3529 repsize += 2+2+1;
3530 else if (*p<1000)
3531 repsize += 2+3+1;
3532 else if (*p<10000)
3533 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003534#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003535 else
3536 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003537#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003538 else if (*p<100000)
3539 repsize += 2+5+1;
3540 else if (*p<1000000)
3541 repsize += 2+6+1;
3542 else
3543 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003544#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003545 }
3546 requiredsize = respos+repsize+(endp-collend);
3547 if (requiredsize > ressize) {
3548 if (requiredsize<2*ressize)
3549 requiredsize = 2*ressize;
3550 if (_PyString_Resize(&res, requiredsize))
3551 goto onError;
3552 str = PyString_AS_STRING(res) + respos;
3553 ressize = requiredsize;
3554 }
3555 /* generate replacement (temporarily (mis)uses p) */
3556 for (p = collstart; p < collend; ++p) {
3557 str += sprintf(str, "&#%d;", (int)*p);
3558 }
3559 p = collend;
3560 break;
3561 default:
3562 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3563 encoding, reason, startp, size, &exc,
3564 collstart-startp, collend-startp, &newpos);
3565 if (repunicode == NULL)
3566 goto onError;
3567 /* need more space? (at least enough for what we
3568 have+the replacement+the rest of the string, so
3569 we won't have to check space for encodable characters) */
3570 respos = str-PyString_AS_STRING(res);
3571 repsize = PyUnicode_GET_SIZE(repunicode);
3572 requiredsize = respos+repsize+(endp-collend);
3573 if (requiredsize > ressize) {
3574 if (requiredsize<2*ressize)
3575 requiredsize = 2*ressize;
3576 if (_PyString_Resize(&res, requiredsize)) {
3577 Py_DECREF(repunicode);
3578 goto onError;
3579 }
3580 str = PyString_AS_STRING(res) + respos;
3581 ressize = requiredsize;
3582 }
3583 /* check if there is anything unencodable in the replacement
3584 and copy it to the output */
3585 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3586 c = *uni2;
3587 if (c >= limit) {
3588 raise_encode_exception(&exc, encoding, startp, size,
3589 unicodepos, unicodepos+1, reason);
3590 Py_DECREF(repunicode);
3591 goto onError;
3592 }
3593 *str = (char)c;
3594 }
3595 p = startp + newpos;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003596 Py_DECREF(repunicode);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003597 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003598 }
3599 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003601 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 if (respos<ressize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003603 /* If this falls res will be NULL */
3604 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 Py_XDECREF(errorHandler);
3606 Py_XDECREF(exc);
3607 return res;
3608
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003609 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 Py_XDECREF(res);
3611 Py_XDECREF(errorHandler);
3612 Py_XDECREF(exc);
3613 return NULL;
3614}
3615
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003617 Py_ssize_t size,
3618 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621}
3622
3623PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3624{
3625 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003626 PyErr_BadArgument();
3627 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 }
3629 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003630 PyUnicode_GET_SIZE(unicode),
3631 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632}
3633
3634/* --- 7-bit ASCII Codec -------------------------------------------------- */
3635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003637 Py_ssize_t size,
3638 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 PyUnicodeObject *v;
3642 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003643 Py_ssize_t startinpos;
3644 Py_ssize_t endinpos;
3645 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646 const char *e;
3647 PyObject *errorHandler = NULL;
3648 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003649
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003651 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003652 Py_UNICODE r = *(unsigned char*)s;
3653 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003654 }
Tim Petersced69f82003-09-16 20:30:58 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 v = _PyUnicode_New(size);
3657 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003658 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003660 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 e = s + size;
3663 while (s < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003664 register unsigned char c = (unsigned char)*s;
3665 if (c < 128) {
3666 *p++ = c;
3667 ++s;
3668 }
3669 else {
3670 startinpos = s-starts;
3671 endinpos = startinpos + 1;
3672 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3673 if (unicode_decode_call_errorhandler(
3674 errors, &errorHandler,
3675 "ascii", "ordinal not in range(128)",
3676 starts, size, &startinpos, &endinpos, &exc, &s,
3677 &v, &outpos, &p))
3678 goto onError;
3679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003681 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003682 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 Py_XDECREF(errorHandler);
3685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003687
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003688 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 Py_XDECREF(errorHandler);
3691 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 return NULL;
3693}
3694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003696 Py_ssize_t size,
3697 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700}
3701
3702PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3703{
3704 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003705 PyErr_BadArgument();
3706 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 }
3708 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003709 PyUnicode_GET_SIZE(unicode),
3710 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711}
3712
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003713#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003714
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003715/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003716
Hirokazu Yamamoto68e075e2009-03-21 13:04:41 +00003717#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003718#define NEED_RETRY
3719#endif
3720
3721/* XXX This code is limited to "true" double-byte encodings, as
3722 a) it assumes an incomplete character consists of a single byte, and
3723 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003724 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003725
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.  Returns 0 when IsDBCSLeadByte() rejects it.  Otherwise the
   preceding character (as located by CharPrev) is examined and 1 is
   returned when there is no preceding character, when it does not start
   with a lead byte, or when it is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3736
3737/*
3738 * Decode MBCS string into unicode object. If 'final' is set, converts
3739 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3740 */
3741static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003742 const char *s, /* MBCS string */
3743 int size, /* sizeof MBCS string */
3744 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003745{
3746 Py_UNICODE *p;
3747 Py_ssize_t n = 0;
3748 int usize = 0;
3749
3750 assert(size >= 0);
3751
3752 /* Skip trailing lead-byte unless 'final' is set */
3753 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003754 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003755
3756 /* First get the size of the result */
3757 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003758 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3759 if (usize == 0) {
3760 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3761 return -1;
3762 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003763 }
3764
3765 if (*v == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003766 /* Create unicode object */
3767 *v = _PyUnicode_New(usize);
3768 if (*v == NULL)
3769 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003770 }
3771 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003772 /* Extend unicode object */
3773 n = PyUnicode_GET_SIZE(*v);
3774 if (_PyUnicode_Resize(v, n + usize) < 0)
3775 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003776 }
3777
3778 /* Do the conversion */
3779 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003780 p = PyUnicode_AS_UNICODE(*v) + n;
3781 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3782 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3783 return -1;
3784 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003785 }
3786
3787 return size;
3788}
3789
3790PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003791 Py_ssize_t size,
3792 const char *errors,
3793 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003794{
3795 PyUnicodeObject *v = NULL;
3796 int done;
3797
3798 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003799 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003800
3801#ifdef NEED_RETRY
3802 retry:
3803 if (size > INT_MAX)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003804 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003805 else
3806#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003807 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003808
3809 if (done < 0) {
3810 Py_XDECREF(v);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003811 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003812 }
3813
3814 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003815 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003816
3817#ifdef NEED_RETRY
3818 if (size > INT_MAX) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003819 s += done;
3820 size -= done;
3821 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003822 }
3823#endif
3824
3825 return (PyObject *)v;
3826}
3827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003828PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003829 Py_ssize_t size,
3830 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003831{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003832 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3833}
3834
3835/*
3836 * Convert unicode into string object (MBCS).
3837 * Returns 0 if succeed, -1 otherwise.
3838 */
3839static int encode_mbcs(PyObject **repr,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003840 const Py_UNICODE *p, /* unicode */
3841 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003842{
3843 int mbcssize = 0;
3844 Py_ssize_t n = 0;
3845
3846 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003847
3848 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003849 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003850 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3851 if (mbcssize == 0) {
3852 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3853 return -1;
3854 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003855 }
3856
Martin v. Löwisd8251432006-06-14 05:21:04 +00003857 if (*repr == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003858 /* Create string object */
3859 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3860 if (*repr == NULL)
3861 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003862 }
3863 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003864 /* Extend string object */
3865 n = PyString_Size(*repr);
3866 if (_PyString_Resize(repr, n + mbcssize) < 0)
3867 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003868 }
3869
3870 /* Do the conversion */
3871 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003872 char *s = PyString_AS_STRING(*repr) + n;
3873 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3874 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3875 return -1;
3876 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003877 }
3878
3879 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003880}
3881
3882PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003883 Py_ssize_t size,
3884 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003885{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003886 PyObject *repr = NULL;
3887 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003888
Martin v. Löwisd8251432006-06-14 05:21:04 +00003889#ifdef NEED_RETRY
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003890 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00003891 if (size > INT_MAX)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003892 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003893 else
3894#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003895 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003896
Martin v. Löwisd8251432006-06-14 05:21:04 +00003897 if (ret < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003898 Py_XDECREF(repr);
3899 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003900 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003901
3902#ifdef NEED_RETRY
3903 if (size > INT_MAX) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003904 p += INT_MAX;
3905 size -= INT_MAX;
3906 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003907 }
3908#endif
3909
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003910 return repr;
3911}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003912
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003913PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3914{
3915 if (!PyUnicode_Check(unicode)) {
3916 PyErr_BadArgument();
3917 return NULL;
3918 }
3919 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003920 PyUnicode_GET_SIZE(unicode),
3921 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003922}
3923
Martin v. Löwisd8251432006-06-14 05:21:04 +00003924#undef NEED_RETRY
3925
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003926#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003927
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928/* --- Character Mapping Codec -------------------------------------------- */
3929
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003931 Py_ssize_t size,
3932 PyObject *mapping,
3933 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003936 Py_ssize_t startinpos;
3937 Py_ssize_t endinpos;
3938 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003939 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 PyUnicodeObject *v;
3941 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003942 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943 PyObject *errorHandler = NULL;
3944 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003945 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003946 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003947
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 /* Default to Latin-1 */
3949 if (mapping == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003950 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951
3952 v = _PyUnicode_New(size);
3953 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003954 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003956 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003959 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003960 mapstring = PyUnicode_AS_UNICODE(mapping);
3961 maplen = PyUnicode_GET_SIZE(mapping);
3962 while (s < e) {
3963 unsigned char ch = *s;
3964 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003966 if (ch < maplen)
3967 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003969 if (x == 0xfffe) {
3970 /* undefined mapping */
3971 outpos = p-PyUnicode_AS_UNICODE(v);
3972 startinpos = s-starts;
3973 endinpos = startinpos+1;
3974 if (unicode_decode_call_errorhandler(
3975 errors, &errorHandler,
3976 "charmap", "character maps to <undefined>",
3977 starts, size, &startinpos, &endinpos, &exc, &s,
3978 &v, &outpos, &p)) {
3979 goto onError;
3980 }
3981 continue;
3982 }
3983 *p++ = x;
3984 ++s;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003985 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003986 }
3987 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003988 while (s < e) {
3989 unsigned char ch = *s;
3990 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003991
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003992 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3993 w = PyInt_FromLong((long)ch);
3994 if (w == NULL)
3995 goto onError;
3996 x = PyObject_GetItem(mapping, w);
3997 Py_DECREF(w);
3998 if (x == NULL) {
3999 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4000 /* No mapping found means: mapping is undefined. */
4001 PyErr_Clear();
4002 x = Py_None;
4003 Py_INCREF(x);
4004 } else
4005 goto onError;
4006 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004007
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004008 /* Apply mapping */
4009 if (PyInt_Check(x)) {
4010 long value = PyInt_AS_LONG(x);
4011 if (value < 0 || value > 65535) {
4012 PyErr_SetString(PyExc_TypeError,
4013 "character mapping must be in range(65536)");
4014 Py_DECREF(x);
4015 goto onError;
4016 }
4017 *p++ = (Py_UNICODE)value;
4018 }
4019 else if (x == Py_None) {
4020 /* undefined mapping */
4021 outpos = p-PyUnicode_AS_UNICODE(v);
4022 startinpos = s-starts;
4023 endinpos = startinpos+1;
4024 if (unicode_decode_call_errorhandler(
4025 errors, &errorHandler,
4026 "charmap", "character maps to <undefined>",
4027 starts, size, &startinpos, &endinpos, &exc, &s,
4028 &v, &outpos, &p)) {
4029 Py_DECREF(x);
4030 goto onError;
4031 }
4032 Py_DECREF(x);
4033 continue;
4034 }
4035 else if (PyUnicode_Check(x)) {
4036 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004037
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004038 if (targetsize == 1)
4039 /* 1-1 mapping */
4040 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004041
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004042 else if (targetsize > 1) {
4043 /* 1-n mapping */
4044 if (targetsize > extrachars) {
4045 /* resize first */
4046 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4047 Py_ssize_t needed = (targetsize - extrachars) + \
4048 (targetsize << 2);
4049 extrachars += needed;
4050 /* XXX overflow detection missing */
4051 if (_PyUnicode_Resize(&v,
4052 PyUnicode_GET_SIZE(v) + needed) < 0) {
4053 Py_DECREF(x);
4054 goto onError;
4055 }
4056 p = PyUnicode_AS_UNICODE(v) + oldpos;
4057 }
4058 Py_UNICODE_COPY(p,
4059 PyUnicode_AS_UNICODE(x),
4060 targetsize);
4061 p += targetsize;
4062 extrachars -= targetsize;
4063 }
4064 /* 1-0 mapping: skip the character */
4065 }
4066 else {
4067 /* wrong return value */
4068 PyErr_SetString(PyExc_TypeError,
4069 "character mapping must return integer, None or unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004070 Py_DECREF(x);
4071 goto onError;
4072 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004073 Py_DECREF(x);
4074 ++s;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 }
4077 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004078 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4079 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 Py_XDECREF(errorHandler);
4081 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004083
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004084 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085 Py_XDECREF(errorHandler);
4086 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 Py_XDECREF(v);
4088 return NULL;
4089}
4090
Martin v. Löwis3f767792006-06-04 19:36:28 +00004091/* Charmap encoding: the lookup table */
4092
4093struct encoding_map{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004094 PyObject_HEAD
4095 unsigned char level1[32];
4096 int count2, count3;
4097 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004098};
4099
4100static PyObject*
4101encoding_map_size(PyObject *obj, PyObject* args)
4102{
4103 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004104 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004105 128*map->count3);
4106}
4107
4108static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004109 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004110 PyDoc_STR("Return the size (in bytes) of this object") },
4111 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004112};
4113
4114static void
4115encoding_map_dealloc(PyObject* o)
4116{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004117 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004118}
4119
4120static PyTypeObject EncodingMapType = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004121 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004122 "EncodingMap", /*tp_name*/
4123 sizeof(struct encoding_map), /*tp_basicsize*/
4124 0, /*tp_itemsize*/
4125 /* methods */
4126 encoding_map_dealloc, /*tp_dealloc*/
4127 0, /*tp_print*/
4128 0, /*tp_getattr*/
4129 0, /*tp_setattr*/
4130 0, /*tp_compare*/
4131 0, /*tp_repr*/
4132 0, /*tp_as_number*/
4133 0, /*tp_as_sequence*/
4134 0, /*tp_as_mapping*/
4135 0, /*tp_hash*/
4136 0, /*tp_call*/
4137 0, /*tp_str*/
4138 0, /*tp_getattro*/
4139 0, /*tp_setattro*/
4140 0, /*tp_as_buffer*/
4141 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4142 0, /*tp_doc*/
4143 0, /*tp_traverse*/
4144 0, /*tp_clear*/
4145 0, /*tp_richcompare*/
4146 0, /*tp_weaklistoffset*/
4147 0, /*tp_iter*/
4148 0, /*tp_iternext*/
4149 encoding_map_methods, /*tp_methods*/
4150 0, /*tp_members*/
4151 0, /*tp_getset*/
4152 0, /*tp_base*/
4153 0, /*tp_dict*/
4154 0, /*tp_descr_get*/
4155 0, /*tp_descr_set*/
4156 0, /*tp_dictoffset*/
4157 0, /*tp_init*/
4158 0, /*tp_alloc*/
4159 0, /*tp_new*/
4160 0, /*tp_free*/
4161 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004162};
4163
4164PyObject*
4165PyUnicode_BuildEncodingMap(PyObject* string)
4166{
4167 Py_UNICODE *decode;
4168 PyObject *result;
4169 struct encoding_map *mresult;
4170 int i;
4171 int need_dict = 0;
4172 unsigned char level1[32];
4173 unsigned char level2[512];
4174 unsigned char *mlevel1, *mlevel2, *mlevel3;
4175 int count2 = 0, count3 = 0;
4176
4177 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4178 PyErr_BadArgument();
4179 return NULL;
4180 }
4181 decode = PyUnicode_AS_UNICODE(string);
4182 memset(level1, 0xFF, sizeof level1);
4183 memset(level2, 0xFF, sizeof level2);
4184
4185 /* If there isn't a one-to-one mapping of NULL to \0,
4186 or if there are non-BMP characters, we need to use
4187 a mapping dictionary. */
4188 if (decode[0] != 0)
4189 need_dict = 1;
4190 for (i = 1; i < 256; i++) {
4191 int l1, l2;
4192 if (decode[i] == 0
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004193#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004194 || decode[i] > 0xFFFF
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004195#endif
4196 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004197 need_dict = 1;
4198 break;
4199 }
4200 if (decode[i] == 0xFFFE)
4201 /* unmapped character */
4202 continue;
4203 l1 = decode[i] >> 11;
4204 l2 = decode[i] >> 7;
4205 if (level1[l1] == 0xFF)
4206 level1[l1] = count2++;
4207 if (level2[l2] == 0xFF)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004208 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004209 }
4210
4211 if (count2 >= 0xFF || count3 >= 0xFF)
4212 need_dict = 1;
4213
4214 if (need_dict) {
4215 PyObject *result = PyDict_New();
4216 PyObject *key, *value;
4217 if (!result)
4218 return NULL;
4219 for (i = 0; i < 256; i++) {
4220 key = value = NULL;
4221 key = PyInt_FromLong(decode[i]);
4222 value = PyInt_FromLong(i);
4223 if (!key || !value)
4224 goto failed1;
4225 if (PyDict_SetItem(result, key, value) == -1)
4226 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004227 Py_DECREF(key);
4228 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004229 }
4230 return result;
4231 failed1:
4232 Py_XDECREF(key);
4233 Py_XDECREF(value);
4234 Py_DECREF(result);
4235 return NULL;
4236 }
4237
4238 /* Create a three-level trie */
4239 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4240 16*count2 + 128*count3 - 1);
4241 if (!result)
4242 return PyErr_NoMemory();
4243 PyObject_Init(result, &EncodingMapType);
4244 mresult = (struct encoding_map*)result;
4245 mresult->count2 = count2;
4246 mresult->count3 = count3;
4247 mlevel1 = mresult->level1;
4248 mlevel2 = mresult->level23;
4249 mlevel3 = mresult->level23 + 16*count2;
4250 memcpy(mlevel1, level1, 32);
4251 memset(mlevel2, 0xFF, 16*count2);
4252 memset(mlevel3, 0, 128*count3);
4253 count3 = 0;
4254 for (i = 1; i < 256; i++) {
4255 int o1, o2, o3, i2, i3;
4256 if (decode[i] == 0xFFFE)
4257 /* unmapped character */
4258 continue;
4259 o1 = decode[i]>>11;
4260 o2 = (decode[i]>>7) & 0xF;
4261 i2 = 16*mlevel1[o1] + o2;
4262 if (mlevel2[i2] == 0xFF)
4263 mlevel2[i2] = count3++;
4264 o3 = decode[i] & 0x7F;
4265 i3 = 128*mlevel2[i2] + o3;
4266 mlevel3[i3] = i;
4267 }
4268 return result;
4269}
4270
4271static int
4272encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4273{
4274 struct encoding_map *map = (struct encoding_map*)mapping;
4275 int l1 = c>>11;
4276 int l2 = (c>>7) & 0xF;
4277 int l3 = c & 0x7F;
4278 int i;
4279
4280#ifdef Py_UNICODE_WIDE
4281 if (c > 0xFFFF) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004282 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004283 }
4284#endif
4285 if (c == 0)
4286 return 0;
4287 /* level 1*/
4288 i = map->level1[l1];
4289 if (i == 0xFF) {
4290 return -1;
4291 }
4292 /* level 2*/
4293 i = map->level23[16*i+l2];
4294 if (i == 0xFF) {
4295 return -1;
4296 }
4297 /* level 3 */
4298 i = map->level23[16*map->count2 + 128*i + l3];
4299 if (i == 0) {
4300 return -1;
4301 }
4302 return i;
4303}
4304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305/* Lookup the character ch in the mapping. If the character
4306 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004307 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 PyObject *w = PyInt_FromLong((long)c);
4311 PyObject *x;
4312
4313 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004314 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315 x = PyObject_GetItem(mapping, w);
4316 Py_DECREF(w);
4317 if (x == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004318 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4319 /* No mapping found means: mapping is undefined. */
4320 PyErr_Clear();
4321 x = Py_None;
4322 Py_INCREF(x);
4323 return x;
4324 } else
4325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004327 else if (x == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004328 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329 else if (PyInt_Check(x)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004330 long value = PyInt_AS_LONG(x);
4331 if (value < 0 || value > 255) {
4332 PyErr_SetString(PyExc_TypeError,
4333 "character mapping must be in range(256)");
4334 Py_DECREF(x);
4335 return NULL;
4336 }
4337 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004339 else if (PyString_Check(x))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004340 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004342 /* wrong return value */
4343 PyErr_SetString(PyExc_TypeError,
4344 "character mapping must return integer, None or str");
4345 Py_DECREF(x);
4346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347 }
4348}
4349
Martin v. Löwis3f767792006-06-04 19:36:28 +00004350static int
4351charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4352{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004353 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4354 /* exponentially overallocate to minimize reallocations */
4355 if (requiredsize < 2*outsize)
4356 requiredsize = 2*outsize;
4357 if (_PyString_Resize(outobj, requiredsize)) {
4358 return 0;
4359 }
4360 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004361}
4362
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* an error occurred during lookup or resize */
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366/* lookup the character, put the result in the output string and adjust
4367 various state variables. Reallocate the output string if not enough
4368 space is available. Return a new reference to the object that
4369 was put in the output buffer, or Py_None, if the mapping was undefined
4370 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004371 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004373charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004374 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004376 PyObject *rep;
4377 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004378 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379
Christian Heimese93237d2007-12-19 02:37:44 +00004380 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004381 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004382 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004383 if (res == -1)
4384 return enc_FAILED;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004385 if (outsize<requiredsize)
4386 if (!charmapencode_resize(outobj, outpos, requiredsize))
4387 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004388 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004389 outstart[(*outpos)++] = (char)res;
4390 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004391 }
4392
4393 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 if (rep==NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004395 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004396 else if (rep==Py_None) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004397 Py_DECREF(rep);
4398 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004399 } else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004400 if (PyInt_Check(rep)) {
4401 Py_ssize_t requiredsize = *outpos+1;
4402 if (outsize<requiredsize)
4403 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4404 Py_DECREF(rep);
4405 return enc_EXCEPTION;
4406 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004407 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004408 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004409 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004410 else {
4411 const char *repchars = PyString_AS_STRING(rep);
4412 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4413 Py_ssize_t requiredsize = *outpos+repsize;
4414 if (outsize<requiredsize)
4415 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4416 Py_DECREF(rep);
4417 return enc_EXCEPTION;
4418 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004419 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004420 memcpy(outstart + *outpos, repchars, repsize);
4421 *outpos += repsize;
4422 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 }
Georg Brandl9f167602006-06-04 21:46:16 +00004424 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004425 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426}
4427
4428/* handle an error in PyUnicode_EncodeCharmap
4429 Return 0 on success, -1 on error */
4430static
4431int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004434 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004435 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436{
4437 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004438 Py_ssize_t repsize;
4439 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 Py_UNICODE *uni2;
4441 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 Py_ssize_t collstartpos = *inpos;
4443 Py_ssize_t collendpos = *inpos+1;
4444 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 char *encoding = "charmap";
4446 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004447 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449 /* find all unencodable characters */
4450 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004451 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004452 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004453 int res = encoding_map_lookup(p[collendpos], mapping);
4454 if (res != -1)
4455 break;
4456 ++collendpos;
4457 continue;
4458 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004459
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004460 rep = charmapencode_lookup(p[collendpos], mapping);
4461 if (rep==NULL)
4462 return -1;
4463 else if (rep!=Py_None) {
4464 Py_DECREF(rep);
4465 break;
4466 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004467 Py_DECREF(rep);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004468 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469 }
4470 /* cache callback name lookup
4471 * (if not done yet, i.e. it's the first error) */
4472 if (*known_errorHandler==-1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004473 if ((errors==NULL) || (!strcmp(errors, "strict")))
4474 *known_errorHandler = 1;
4475 else if (!strcmp(errors, "replace"))
4476 *known_errorHandler = 2;
4477 else if (!strcmp(errors, "ignore"))
4478 *known_errorHandler = 3;
4479 else if (!strcmp(errors, "xmlcharrefreplace"))
4480 *known_errorHandler = 4;
4481 else
4482 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 }
4484 switch (*known_errorHandler) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004485 case 1: /* strict */
4486 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4487 return -1;
4488 case 2: /* replace */
4489 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004490 x = charmapencode_output('?', mapping, res, respos);
4491 if (x==enc_EXCEPTION) {
4492 return -1;
4493 }
4494 else if (x==enc_FAILED) {
4495 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4496 return -1;
4497 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004498 }
4499 /* fall through */
4500 case 3: /* ignore */
4501 *inpos = collendpos;
4502 break;
4503 case 4: /* xmlcharrefreplace */
4504 /* generate replacement (temporarily (mis)uses p) */
4505 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004506 char buffer[2+29+1+1];
4507 char *cp;
4508 sprintf(buffer, "&#%d;", (int)p[collpos]);
4509 for (cp = buffer; *cp; ++cp) {
4510 x = charmapencode_output(*cp, mapping, res, respos);
4511 if (x==enc_EXCEPTION)
4512 return -1;
4513 else if (x==enc_FAILED) {
4514 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4515 return -1;
4516 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004517 }
4518 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004519 *inpos = collendpos;
4520 break;
4521 default:
4522 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004523 encoding, reason, p, size, exceptionObject,
4524 collstartpos, collendpos, &newpos);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004525 if (repunicode == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004526 return -1;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004527 /* generate replacement */
4528 repsize = PyUnicode_GET_SIZE(repunicode);
4529 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004530 x = charmapencode_output(*uni2, mapping, res, respos);
4531 if (x==enc_EXCEPTION) {
4532 return -1;
4533 }
4534 else if (x==enc_FAILED) {
4535 Py_DECREF(repunicode);
4536 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4537 return -1;
4538 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004539 }
4540 *inpos = newpos;
4541 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 }
4543 return 0;
4544}
4545
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004547 Py_ssize_t size,
4548 PyObject *mapping,
4549 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 /* output object */
4552 PyObject *res = NULL;
4553 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004556 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 PyObject *errorHandler = NULL;
4558 PyObject *exc = NULL;
4559 /* the following variable is used for caching string comparisons
4560 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4561 * 3=ignore, 4=xmlcharrefreplace */
4562 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563
4564 /* Default to Latin-1 */
4565 if (mapping == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004566 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 /* allocate enough for a simple encoding without
4569 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004570 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 if (res == NULL)
4572 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004573 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004574 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 while (inpos<size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004577 /* try to encode it */
4578 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4579 if (x==enc_EXCEPTION) /* error */
4580 goto onError;
4581 if (x==enc_FAILED) { /* unencodable character */
4582 if (charmap_encoding_error(p, size, &inpos, mapping,
4583 &exc,
4584 &known_errorHandler, &errorHandler, errors,
4585 &res, &respos)) {
4586 goto onError;
4587 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004588 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004589 else
4590 /* done with this character => adjust input position */
4591 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004595 if (respos<PyString_GET_SIZE(res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004596 if (_PyString_Resize(&res, respos))
4597 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 }
4599 Py_XDECREF(exc);
4600 Py_XDECREF(errorHandler);
4601 return res;
4602
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004603 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 Py_XDECREF(res);
4605 Py_XDECREF(exc);
4606 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607 return NULL;
4608}
4609
4610PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004611 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612{
4613 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004614 PyErr_BadArgument();
4615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 }
4617 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004618 PyUnicode_GET_SIZE(unicode),
4619 mapping,
4620 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621}
4622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623/* create or adjust a UnicodeTranslateError */
4624static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004625 const Py_UNICODE *unicode, Py_ssize_t size,
4626 Py_ssize_t startpos, Py_ssize_t endpos,
4627 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 if (*exceptionObject == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004630 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004631 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632 }
4633 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004634 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4635 goto onError;
4636 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4637 goto onError;
4638 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4639 goto onError;
4640 return;
4641 onError:
4642 Py_DECREF(*exceptionObject);
4643 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644 }
4645}
4646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647/* raises a UnicodeTranslateError */
4648static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004649 const Py_UNICODE *unicode, Py_ssize_t size,
4650 Py_ssize_t startpos, Py_ssize_t endpos,
4651 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652{
4653 make_translate_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004654 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 if (*exceptionObject != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004656 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657}
4658
4659/* error handling callback helper:
4660 build arguments, call the callback and check the arguments,
4661 put the result into newpos and return the replacement string, which
4662 has to be freed by the caller */
4663static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004664 PyObject **errorHandler,
4665 const char *reason,
4666 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4667 Py_ssize_t startpos, Py_ssize_t endpos,
4668 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004670 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004671
Martin v. Löwis412fb672006-04-13 06:34:32 +00004672 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 PyObject *restuple;
4674 PyObject *resunicode;
4675
4676 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004677 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 if (*errorHandler == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 }
4681
4682 make_translate_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004683 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 if (*exceptionObject == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686
4687 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004688 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00004692 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004693 Py_DECREF(restuple);
4694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 }
4696 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004697 &resunicode, &i_newpos)) {
4698 Py_DECREF(restuple);
4699 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004701 if (i_newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004702 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004703 else
4704 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004705 if (*newpos<0 || *newpos>size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004706 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4707 Py_DECREF(restuple);
4708 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 Py_INCREF(resunicode);
4711 Py_DECREF(restuple);
4712 return resunicode;
4713}
4714
4715/* Lookup the character ch in the mapping and put the result in result,
4716 which must be decrefed by the caller.
4717 Return 0 on success, -1 on error */
4718static
4719int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4720{
4721 PyObject *w = PyInt_FromLong((long)c);
4722 PyObject *x;
4723
4724 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004725 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726 x = PyObject_GetItem(mapping, w);
4727 Py_DECREF(w);
4728 if (x == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004729 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4730 /* No mapping found means: use 1:1 mapping. */
4731 PyErr_Clear();
4732 *result = NULL;
4733 return 0;
4734 } else
4735 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 }
4737 else if (x == Py_None) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004738 *result = x;
4739 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004740 }
4741 else if (PyInt_Check(x)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004742 long value = PyInt_AS_LONG(x);
4743 long max = PyUnicode_GetMax();
4744 if (value < 0 || value > max) {
4745 PyErr_Format(PyExc_TypeError,
4746 "character mapping must be in range(0x%lx)", max+1);
4747 Py_DECREF(x);
4748 return -1;
4749 }
4750 *result = x;
4751 return 0;
4752 }
4753 else if (PyUnicode_Check(x)) {
4754 *result = x;
4755 return 0;
4756 }
4757 else {
4758 /* wrong return value */
4759 PyErr_SetString(PyExc_TypeError,
4760 "character mapping must return integer, None or unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004761 Py_DECREF(x);
4762 return -1;
4763 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764}
4765/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004766 if not reallocate and adjust various state variables.
4767 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768static
Walter Dörwald4894c302003-10-24 14:25:28 +00004769int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004770 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004772 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004773 if (requiredsize > oldsize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004774 /* remember old output position */
4775 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4776 /* exponentially overallocate to minimize reallocations */
4777 if (requiredsize < 2 * oldsize)
4778 requiredsize = 2 * oldsize;
4779 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4780 return -1;
4781 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 }
4783 return 0;
4784}
4785/* lookup the character, put the result in the output string and adjust
4786 various state variables. Return a new reference to the object that
4787 was put in the output buffer in *result, or Py_None, if the mapping was
4788 undefined (in which case no character was written).
4789 The called must decref result.
4790 Return 0 on success, -1 on error. */
4791static
Walter Dörwald4894c302003-10-24 14:25:28 +00004792int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004793 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4794 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795{
Walter Dörwald4894c302003-10-24 14:25:28 +00004796 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004797 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 if (*res==NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004799 /* not found => default to 1:1 mapping */
4800 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 }
4802 else if (*res==Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004803 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 else if (PyInt_Check(*res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004805 /* no overflow check, because we know that the space is enough */
4806 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 }
4808 else if (PyUnicode_Check(*res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004809 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4810 if (repsize==1) {
4811 /* no overflow check, because we know that the space is enough */
4812 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4813 }
4814 else if (repsize!=0) {
4815 /* more than one character */
4816 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4817 (insize - (curinp-startinp)) +
4818 repsize - 1;
4819 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4820 return -1;
4821 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4822 *outp += repsize;
4823 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 }
4825 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004826 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004827 return 0;
4828}
4829
4830PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004831 Py_ssize_t size,
4832 PyObject *mapping,
4833 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 /* output object */
4836 PyObject *res = NULL;
4837 /* pointers to the beginning and end+1 of input */
4838 const Py_UNICODE *startp = p;
4839 const Py_UNICODE *endp = p + size;
4840 /* pointer into the output */
4841 Py_UNICODE *str;
4842 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004843 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844 char *reason = "character maps to <undefined>";
4845 PyObject *errorHandler = NULL;
4846 PyObject *exc = NULL;
4847 /* the following variable is used for caching string comparisons
4848 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4849 * 3=ignore, 4=xmlcharrefreplace */
4850 int known_errorHandler = -1;
4851
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 if (mapping == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004853 PyErr_BadArgument();
4854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856
4857 /* allocate enough for a simple 1:1 translation without
4858 replacements, if we need more, we'll resize */
4859 res = PyUnicode_FromUnicode(NULL, size);
4860 if (res == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004861 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004863 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 while (p<endp) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004867 /* try to encode it */
4868 PyObject *x = NULL;
4869 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4870 Py_XDECREF(x);
4871 goto onError;
4872 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004873 Py_XDECREF(x);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004874 if (x!=Py_None) /* it worked => adjust input pointer */
4875 ++p;
4876 else { /* untranslatable character */
4877 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4878 Py_ssize_t repsize;
4879 Py_ssize_t newpos;
4880 Py_UNICODE *uni2;
4881 /* startpos for collecting untranslatable chars */
4882 const Py_UNICODE *collstart = p;
4883 const Py_UNICODE *collend = p+1;
4884 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004886 /* find all untranslatable characters */
4887 while (collend < endp) {
4888 if (charmaptranslate_lookup(*collend, mapping, &x))
4889 goto onError;
4890 Py_XDECREF(x);
4891 if (x!=Py_None)
4892 break;
4893 ++collend;
4894 }
4895 /* cache callback name lookup
4896 * (if not done yet, i.e. it's the first error) */
4897 if (known_errorHandler==-1) {
4898 if ((errors==NULL) || (!strcmp(errors, "strict")))
4899 known_errorHandler = 1;
4900 else if (!strcmp(errors, "replace"))
4901 known_errorHandler = 2;
4902 else if (!strcmp(errors, "ignore"))
4903 known_errorHandler = 3;
4904 else if (!strcmp(errors, "xmlcharrefreplace"))
4905 known_errorHandler = 4;
4906 else
4907 known_errorHandler = 0;
4908 }
4909 switch (known_errorHandler) {
4910 case 1: /* strict */
4911 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004912 goto onError;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004913 case 2: /* replace */
4914 /* No need to check for space, this is a 1:1 replacement */
4915 for (coll = collstart; coll<collend; ++coll)
4916 *str++ = '?';
4917 /* fall through */
4918 case 3: /* ignore */
4919 p = collend;
4920 break;
4921 case 4: /* xmlcharrefreplace */
4922 /* generate replacement (temporarily (mis)uses p) */
4923 for (p = collstart; p < collend; ++p) {
4924 char buffer[2+29+1+1];
4925 char *cp;
4926 sprintf(buffer, "&#%d;", (int)*p);
4927 if (charmaptranslate_makespace(&res, &str,
4928 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4929 goto onError;
4930 for (cp = buffer; *cp; ++cp)
4931 *str++ = *cp;
4932 }
4933 p = collend;
4934 break;
4935 default:
4936 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4937 reason, startp, size, &exc,
4938 collstart-startp, collend-startp, &newpos);
4939 if (repunicode == NULL)
4940 goto onError;
4941 /* generate replacement */
4942 repsize = PyUnicode_GET_SIZE(repunicode);
4943 if (charmaptranslate_makespace(&res, &str,
4944 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4945 Py_DECREF(repunicode);
4946 goto onError;
4947 }
4948 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4949 *str++ = *uni2;
4950 p = startp + newpos;
4951 Py_DECREF(repunicode);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004952 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004953 }
4954 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 /* Resize if we allocated to much */
4956 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004957 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004958 if (PyUnicode_Resize(&res, respos) < 0)
4959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004960 }
4961 Py_XDECREF(exc);
4962 Py_XDECREF(errorHandler);
4963 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004965 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 Py_XDECREF(res);
4967 Py_XDECREF(exc);
4968 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969 return NULL;
4970}
4971
4972PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004973 PyObject *mapping,
4974 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975{
4976 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004977
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 str = PyUnicode_FromObject(str);
4979 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004980 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004982 PyUnicode_GET_SIZE(str),
4983 mapping,
4984 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 Py_DECREF(str);
4986 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004987
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004988 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 Py_XDECREF(str);
4990 return NULL;
4991}
Tim Petersced69f82003-09-16 20:30:58 +00004992
Guido van Rossum9e896b32000-04-05 20:11:21 +00004993/* --- Decimal Encoder ---------------------------------------------------- */
4994
4995int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004996 Py_ssize_t length,
4997 char *output,
4998 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004999{
5000 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005001 PyObject *errorHandler = NULL;
5002 PyObject *exc = NULL;
5003 const char *encoding = "decimal";
5004 const char *reason = "invalid decimal Unicode string";
5005 /* the following variable is used for caching string comparisons
5006 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5007 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005008
5009 if (output == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005010 PyErr_BadArgument();
5011 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005012 }
5013
5014 p = s;
5015 end = s + length;
5016 while (p < end) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005017 register Py_UNICODE ch = *p;
5018 int decimal;
5019 PyObject *repunicode;
5020 Py_ssize_t repsize;
5021 Py_ssize_t newpos;
5022 Py_UNICODE *uni2;
5023 Py_UNICODE *collstart;
5024 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005025
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005026 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005027 *output++ = ' ';
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005028 ++p;
5029 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005030 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005031 decimal = Py_UNICODE_TODECIMAL(ch);
5032 if (decimal >= 0) {
5033 *output++ = '0' + decimal;
5034 ++p;
5035 continue;
5036 }
5037 if (0 < ch && ch < 256) {
5038 *output++ = (char)ch;
5039 ++p;
5040 continue;
5041 }
5042 /* All other characters are considered unencodable */
5043 collstart = p;
5044 collend = p+1;
5045 while (collend < end) {
5046 if ((0 < *collend && *collend < 256) ||
5047 !Py_UNICODE_ISSPACE(*collend) ||
5048 Py_UNICODE_TODECIMAL(*collend))
5049 break;
5050 }
5051 /* cache callback name lookup
5052 * (if not done yet, i.e. it's the first error) */
5053 if (known_errorHandler==-1) {
5054 if ((errors==NULL) || (!strcmp(errors, "strict")))
5055 known_errorHandler = 1;
5056 else if (!strcmp(errors, "replace"))
5057 known_errorHandler = 2;
5058 else if (!strcmp(errors, "ignore"))
5059 known_errorHandler = 3;
5060 else if (!strcmp(errors, "xmlcharrefreplace"))
5061 known_errorHandler = 4;
5062 else
5063 known_errorHandler = 0;
5064 }
5065 switch (known_errorHandler) {
5066 case 1: /* strict */
5067 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5068 goto onError;
5069 case 2: /* replace */
5070 for (p = collstart; p < collend; ++p)
5071 *output++ = '?';
5072 /* fall through */
5073 case 3: /* ignore */
5074 p = collend;
5075 break;
5076 case 4: /* xmlcharrefreplace */
5077 /* generate replacement (temporarily (mis)uses p) */
5078 for (p = collstart; p < collend; ++p)
5079 output += sprintf(output, "&#%d;", (int)*p);
5080 p = collend;
5081 break;
5082 default:
5083 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5084 encoding, reason, s, length, &exc,
5085 collstart-s, collend-s, &newpos);
5086 if (repunicode == NULL)
5087 goto onError;
5088 /* generate replacement */
5089 repsize = PyUnicode_GET_SIZE(repunicode);
5090 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5091 Py_UNICODE ch = *uni2;
5092 if (Py_UNICODE_ISSPACE(ch))
5093 *output++ = ' ';
5094 else {
5095 decimal = Py_UNICODE_TODECIMAL(ch);
5096 if (decimal >= 0)
5097 *output++ = '0' + decimal;
5098 else if (0 < ch && ch < 256)
5099 *output++ = (char)ch;
5100 else {
5101 Py_DECREF(repunicode);
5102 raise_encode_exception(&exc, encoding,
5103 s, length, collstart-s, collend-s, reason);
5104 goto onError;
5105 }
5106 }
5107 }
5108 p = s + newpos;
5109 Py_DECREF(repunicode);
5110 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005111 }
5112 /* 0-terminate the output string */
5113 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 Py_XDECREF(exc);
5115 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005116 return 0;
5117
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005118 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005119 Py_XDECREF(exc);
5120 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005121 return -1;
5122}
5123
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124/* --- Helpers ------------------------------------------------------------ */
5125
Eric Smitha9f7d622008-02-17 19:46:49 +00005126#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005127
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005128#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005129
Fredrik Lundha50d2012006-05-26 17:04:58 +00005130#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005131
5132#include "stringlib/count.h"
5133#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005134#include "stringlib/partition.h"
5135
/* Helper macro that normalizes slice bounds in place.  It reads and
   writes the variables `start` and `end` of the enclosing scope and
   uses (obj)->length as the sequence length: a negative bound is
   taken relative to the end and then floored at 0, and `end` is
   capped at the length before that adjustment. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5148
Martin v. Löwis18e16552006-02-15 17:27:45 +00005149Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005150 PyObject *substr,
5151 Py_ssize_t start,
5152 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005154 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005155 PyUnicodeObject* str_obj;
5156 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005157
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005158 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5159 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005160 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005161 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5162 if (!sub_obj) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005163 Py_DECREF(str_obj);
5164 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 }
Tim Petersced69f82003-09-16 20:30:58 +00005166
Fredrik Lundhc8162812006-05-26 19:33:03 +00005167 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005168
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005169 result = stringlib_count(
5170 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5171 );
5172
5173 Py_DECREF(sub_obj);
5174 Py_DECREF(str_obj);
5175
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 return result;
5177}
5178
Martin v. Löwis18e16552006-02-15 17:27:45 +00005179Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005180 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005181 Py_ssize_t start,
5182 Py_ssize_t end,
5183 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005185 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005186
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005187 str = PyUnicode_FromObject(str);
5188 if (!str)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005189 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005190 sub = PyUnicode_FromObject(sub);
5191 if (!sub) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005192 Py_DECREF(str);
5193 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 }
Tim Petersced69f82003-09-16 20:30:58 +00005195
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005196 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005197 result = stringlib_find_slice(
5198 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5199 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5200 start, end
5201 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005202 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005203 result = stringlib_rfind_slice(
5204 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5205 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5206 start, end
5207 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005208
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005209 Py_DECREF(str);
5210 Py_DECREF(sub);
5211
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 return result;
5213}
5214
Tim Petersced69f82003-09-16 20:30:58 +00005215static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216int tailmatch(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005217 PyUnicodeObject *substring,
5218 Py_ssize_t start,
5219 Py_ssize_t end,
5220 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 if (substring->length == 0)
5223 return 1;
5224
Fredrik Lundhc8162812006-05-26 19:33:03 +00005225 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226
5227 end -= substring->length;
5228 if (end < start)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005229 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
5231 if (direction > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005232 if (Py_UNICODE_MATCH(self, end, substring))
5233 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 } else {
5235 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005236 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 }
5238
5239 return 0;
5240}
5241
Martin v. Löwis18e16552006-02-15 17:27:45 +00005242Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005243 PyObject *substr,
5244 Py_ssize_t start,
5245 Py_ssize_t end,
5246 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005248 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005249
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 str = PyUnicode_FromObject(str);
5251 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005252 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 substr = PyUnicode_FromObject(substr);
5254 if (substr == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005255 Py_DECREF(str);
5256 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 }
Tim Petersced69f82003-09-16 20:30:58 +00005258
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005260 (PyUnicodeObject *)substr,
5261 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 Py_DECREF(str);
5263 Py_DECREF(substr);
5264 return result;
5265}
5266
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267/* Apply fixfct filter to the Unicode object self and return a
5268 reference to the modified object */
5269
Tim Petersced69f82003-09-16 20:30:58 +00005270static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005272 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273{
5274
5275 PyUnicodeObject *u;
5276
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005277 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005279 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005280
5281 Py_UNICODE_COPY(u->str, self->str, self->length);
5282
Tim Peters7a29bd52001-09-12 03:03:31 +00005283 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005284 /* fixfct should return TRUE if it modified the buffer. If
5285 FALSE, return a reference to the original buffer instead
5286 (to save space, not time) */
5287 Py_INCREF(self);
5288 Py_DECREF(u);
5289 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 }
5291 return (PyObject*) u;
5292}
5293
Tim Petersced69f82003-09-16 20:30:58 +00005294static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295int fixupper(PyUnicodeObject *self)
5296{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 Py_UNICODE *s = self->str;
5299 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 while (len-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005302 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005303
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005304 ch = Py_UNICODE_TOUPPER(*s);
5305 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 status = 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005307 *s = ch;
5308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 s++;
5310 }
5311
5312 return status;
5313}
5314
Tim Petersced69f82003-09-16 20:30:58 +00005315static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316int fixlower(PyUnicodeObject *self)
5317{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005318 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 Py_UNICODE *s = self->str;
5320 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005321
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 while (len-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005323 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005324
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005325 ch = Py_UNICODE_TOLOWER(*s);
5326 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 status = 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005328 *s = ch;
5329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 s++;
5331 }
5332
5333 return status;
5334}
5335
Tim Petersced69f82003-09-16 20:30:58 +00005336static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337int fixswapcase(PyUnicodeObject *self)
5338{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005339 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 Py_UNICODE *s = self->str;
5341 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005342
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 while (len-- > 0) {
5344 if (Py_UNICODE_ISUPPER(*s)) {
5345 *s = Py_UNICODE_TOLOWER(*s);
5346 status = 1;
5347 } else if (Py_UNICODE_ISLOWER(*s)) {
5348 *s = Py_UNICODE_TOUPPER(*s);
5349 status = 1;
5350 }
5351 s++;
5352 }
5353
5354 return status;
5355}
5356
Tim Petersced69f82003-09-16 20:30:58 +00005357static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358int fixcapitalize(PyUnicodeObject *self)
5359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005360 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005361 Py_UNICODE *s = self->str;
5362 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005363
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005364 if (len == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005365 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005366 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005367 *s = Py_UNICODE_TOUPPER(*s);
5368 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005370 s++;
5371 while (--len > 0) {
5372 if (Py_UNICODE_ISUPPER(*s)) {
5373 *s = Py_UNICODE_TOLOWER(*s);
5374 status = 1;
5375 }
5376 s++;
5377 }
5378 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379}
5380
5381static
5382int fixtitle(PyUnicodeObject *self)
5383{
5384 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5385 register Py_UNICODE *e;
5386 int previous_is_cased;
5387
5388 /* Shortcut for single character strings */
5389 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005390 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5391 if (*p != ch) {
5392 *p = ch;
5393 return 1;
5394 }
5395 else
5396 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 }
Tim Petersced69f82003-09-16 20:30:58 +00005398
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 e = p + PyUnicode_GET_SIZE(self);
5400 previous_is_cased = 0;
5401 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005402 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005403
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005404 if (previous_is_cased)
5405 *p = Py_UNICODE_TOLOWER(ch);
5406 else
5407 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005408
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005409 if (Py_UNICODE_ISLOWER(ch) ||
5410 Py_UNICODE_ISUPPER(ch) ||
5411 Py_UNICODE_ISTITLE(ch))
5412 previous_is_cased = 1;
5413 else
5414 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 }
5416 return 1;
5417}
5418
Tim Peters8ce9f162004-08-27 01:49:32 +00005419PyObject *
5420PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421{
Tim Peters8ce9f162004-08-27 01:49:32 +00005422 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005423 const Py_UNICODE blank = ' ';
5424 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005425 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005426 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005427 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5428 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005429 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5430 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005431 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005432 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005433 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434
Tim Peters05eba1f2004-08-27 21:32:02 +00005435 fseq = PySequence_Fast(seq, "");
5436 if (fseq == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005437 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005438 }
5439
Tim Peters91879ab2004-08-27 22:35:44 +00005440 /* Grrrr. A codec may be invoked to convert str objects to
5441 * Unicode, and so it's possible to call back into Python code
5442 * during PyUnicode_FromObject(), and so it's possible for a sick
5443 * codec to change the size of fseq (if seq is a list). Therefore
5444 * we have to keep refetching the size -- can't assume seqlen
5445 * is invariant.
5446 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005447 seqlen = PySequence_Fast_GET_SIZE(fseq);
5448 /* If empty sequence, return u"". */
5449 if (seqlen == 0) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005450 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5451 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005452 }
5453 /* If singleton sequence with an exact Unicode, return that. */
5454 if (seqlen == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005455 item = PySequence_Fast_GET_ITEM(fseq, 0);
5456 if (PyUnicode_CheckExact(item)) {
5457 Py_INCREF(item);
5458 res = (PyUnicodeObject *)item;
5459 goto Done;
5460 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005461 }
5462
Tim Peters05eba1f2004-08-27 21:32:02 +00005463 /* At least two items to join, or one that isn't exact Unicode. */
5464 if (seqlen > 1) {
5465 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005466 if (separator == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005467 sep = &blank;
5468 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005469 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005470 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005471 internal_separator = PyUnicode_FromObject(separator);
5472 if (internal_separator == NULL)
5473 goto onError;
5474 sep = PyUnicode_AS_UNICODE(internal_separator);
5475 seplen = PyUnicode_GET_SIZE(internal_separator);
5476 /* In case PyUnicode_FromObject() mutated seq. */
5477 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005478 }
5479 }
5480
5481 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005482 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005483 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005484 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005485 res_p = PyUnicode_AS_UNICODE(res);
5486 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005487
Tim Peters05eba1f2004-08-27 21:32:02 +00005488 for (i = 0; i < seqlen; ++i) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005489 Py_ssize_t itemlen;
5490 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005491
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005492 item = PySequence_Fast_GET_ITEM(fseq, i);
5493 /* Convert item to Unicode. */
5494 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5495 PyErr_Format(PyExc_TypeError,
5496 "sequence item %zd: expected string or Unicode,"
5497 " %.80s found",
5498 i, Py_TYPE(item)->tp_name);
5499 goto onError;
5500 }
5501 item = PyUnicode_FromObject(item);
5502 if (item == NULL)
5503 goto onError;
5504 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005505
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005506 /* In case PyUnicode_FromObject() mutated seq. */
5507 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005508
Tim Peters8ce9f162004-08-27 01:49:32 +00005509 /* Make sure we have enough space for the separator and the item. */
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005510 itemlen = PyUnicode_GET_SIZE(item);
5511 new_res_used = res_used + itemlen;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005512 if (new_res_used < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005513 goto Overflow;
5514 if (i < seqlen - 1) {
5515 new_res_used += seplen;
5516 if (new_res_used < 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005517 goto Overflow;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005518 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005519 if (new_res_used > res_alloc) {
5520 /* double allocated size until it's big enough */
5521 do {
5522 res_alloc += res_alloc;
5523 if (res_alloc <= 0)
5524 goto Overflow;
5525 } while (new_res_used > res_alloc);
5526 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5527 Py_DECREF(item);
5528 goto onError;
5529 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005530 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005531 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005532
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005533 /* Copy item, and maybe the separator. */
5534 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5535 res_p += itemlen;
5536 if (i < seqlen - 1) {
5537 Py_UNICODE_COPY(res_p, sep, seplen);
5538 res_p += seplen;
5539 }
5540 Py_DECREF(item);
5541 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005542 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005543
Tim Peters05eba1f2004-08-27 21:32:02 +00005544 /* Shrink res to match the used area; this probably can't fail,
5545 * but it's cheap to check.
5546 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005547 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005548 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005549
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005550 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005551 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005552 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 return (PyObject *)res;
5554
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005555 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005556 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005557 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005558 Py_DECREF(item);
5559 /* fall through */
5560
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005561 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005562 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005563 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005564 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 return NULL;
5566}
5567
Tim Petersced69f82003-09-16 20:30:58 +00005568static
5569PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005570 Py_ssize_t left,
5571 Py_ssize_t right,
5572 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573{
5574 PyUnicodeObject *u;
5575
5576 if (left < 0)
5577 left = 0;
5578 if (right < 0)
5579 right = 0;
5580
Tim Peters7a29bd52001-09-12 03:03:31 +00005581 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 Py_INCREF(self);
5583 return self;
5584 }
5585
Neal Norwitze7d8be82008-07-31 17:17:14 +00005586 if (left > PY_SSIZE_T_MAX - self->length ||
5587 right > PY_SSIZE_T_MAX - (left + self->length)) {
5588 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5589 return NULL;
5590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 u = _PyUnicode_New(left + self->length + right);
5592 if (u) {
5593 if (left)
5594 Py_UNICODE_FILL(u->str, fill, left);
5595 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5596 if (right)
5597 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5598 }
5599
5600 return u;
5601}
5602
/* Append the slice data[left:right] to `list` as a new unicode object.
 *
 * Relies on the enclosing function providing a `PyObject *str` scratch
 * variable, a `PyObject *list` to append to, and an `onError` label,
 * which is jumped to if either the object creation or the append fails.
 * The temporary reference held in str is always released.
 *
 * Wrapped in do { } while (0) so the expansion is one statement and is
 * safe under an unbraced if/else.  Invoke it with a trailing semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
5614static
5615PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005616 PyObject *list,
5617 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005619 register Py_ssize_t i;
5620 register Py_ssize_t j;
5621 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005623 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624
5625 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005626 /* find a token */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005627 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005628 i++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005629 j = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005630 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5631 i++;
5632 if (j < i) {
5633 if (maxcount-- <= 0)
5634 break;
5635 SPLIT_APPEND(buf, j, i);
5636 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5637 i++;
5638 j = i;
5639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 }
5641 if (j < len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005642 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 }
5644 return list;
5645
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005646 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 Py_DECREF(list);
5648 return NULL;
5649}
5650
5651PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005652 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005654 register Py_ssize_t i;
5655 register Py_ssize_t j;
5656 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 PyObject *list;
5658 PyObject *str;
5659 Py_UNICODE *data;
5660
5661 string = PyUnicode_FromObject(string);
5662 if (string == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 data = PyUnicode_AS_UNICODE(string);
5665 len = PyUnicode_GET_SIZE(string);
5666
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 list = PyList_New(0);
5668 if (!list)
5669 goto onError;
5670
5671 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005672 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005673
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005674 /* Find a line and append it */
5675 while (i < len && !BLOOM_LINEBREAK(data[i]))
5676 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005678 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005679 eol = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005680 if (i < len) {
5681 if (data[i] == '\r' && i + 1 < len &&
5682 data[i+1] == '\n')
5683 i += 2;
5684 else
5685 i++;
5686 if (keepends)
5687 eol = i;
5688 }
5689 SPLIT_APPEND(data, j, eol);
5690 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 }
5692 if (j < len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005693 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 }
5695
5696 Py_DECREF(string);
5697 return list;
5698
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005699 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005700 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 Py_DECREF(string);
5702 return NULL;
5703}
5704
Tim Petersced69f82003-09-16 20:30:58 +00005705static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005707 PyObject *list,
5708 Py_UNICODE ch,
5709 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005711 register Py_ssize_t i;
5712 register Py_ssize_t j;
5713 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005715 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716
5717 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005718 if (buf[i] == ch) {
5719 if (maxcount-- <= 0)
5720 break;
5721 SPLIT_APPEND(buf, j, i);
5722 i = j = i + 1;
5723 } else
5724 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 }
5726 if (j <= len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005727 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 }
5729 return list;
5730
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005731 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 Py_DECREF(list);
5733 return NULL;
5734}
5735
Tim Petersced69f82003-09-16 20:30:58 +00005736static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005738 PyObject *list,
5739 PyUnicodeObject *substring,
5740 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005742 register Py_ssize_t i;
5743 register Py_ssize_t j;
5744 Py_ssize_t len = self->length;
5745 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 PyObject *str;
5747
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005748 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005749 if (Py_UNICODE_MATCH(self, i, substring)) {
5750 if (maxcount-- <= 0)
5751 break;
5752 SPLIT_APPEND(self->str, j, i);
5753 i = j = i + sublen;
5754 } else
5755 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 }
5757 if (j <= len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005758 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 }
5760 return list;
5761
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005762 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 Py_DECREF(list);
5764 return NULL;
5765}
5766
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767static
5768PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005769 PyObject *list,
5770 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005772 register Py_ssize_t i;
5773 register Py_ssize_t j;
5774 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005776 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005777
5778 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005779 /* find a token */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005780 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005781 i--;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005782 j = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005783 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5784 i--;
5785 if (j > i) {
5786 if (maxcount-- <= 0)
5787 break;
5788 SPLIT_APPEND(buf, i + 1, j + 1);
5789 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5790 i--;
5791 j = i;
5792 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005793 }
5794 if (j >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005795 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005796 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005797 if (PyList_Reverse(list) < 0)
5798 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005799 return list;
5800
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005801 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005802 Py_DECREF(list);
5803 return NULL;
5804}
5805
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005806static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005807PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005808 PyObject *list,
5809 Py_UNICODE ch,
5810 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005811{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005812 register Py_ssize_t i;
5813 register Py_ssize_t j;
5814 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005815 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005816 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005817
5818 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005819 if (buf[i] == ch) {
5820 if (maxcount-- <= 0)
5821 break;
5822 SPLIT_APPEND(buf, i + 1, j + 1);
5823 j = i = i - 1;
5824 } else
5825 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005826 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005827 if (j >= -1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005828 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005829 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005830 if (PyList_Reverse(list) < 0)
5831 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005832 return list;
5833
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005834 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005835 Py_DECREF(list);
5836 return NULL;
5837}
5838
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005839static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005840PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005841 PyObject *list,
5842 PyUnicodeObject *substring,
5843 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005844{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005845 register Py_ssize_t i;
5846 register Py_ssize_t j;
5847 Py_ssize_t len = self->length;
5848 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005849 PyObject *str;
5850
5851 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005852 if (Py_UNICODE_MATCH(self, i, substring)) {
5853 if (maxcount-- <= 0)
5854 break;
5855 SPLIT_APPEND(self->str, i + sublen, j);
5856 j = i;
5857 i -= sublen;
5858 } else
5859 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005860 }
5861 if (j >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005862 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005863 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005864 if (PyList_Reverse(list) < 0)
5865 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005866 return list;
5867
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005868 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005869 Py_DECREF(list);
5870 return NULL;
5871}
5872
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873#undef SPLIT_APPEND
5874
5875static
5876PyObject *split(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005877 PyUnicodeObject *substring,
5878 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879{
5880 PyObject *list;
5881
5882 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005883 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884
5885 list = PyList_New(0);
5886 if (!list)
5887 return NULL;
5888
5889 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005890 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
5892 else if (substring->length == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005893 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894
5895 else if (substring->length == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005896 Py_DECREF(list);
5897 PyErr_SetString(PyExc_ValueError, "empty separator");
5898 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 }
5900 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005901 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902}
5903
Tim Petersced69f82003-09-16 20:30:58 +00005904static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005905PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005906 PyUnicodeObject *substring,
5907 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005908{
5909 PyObject *list;
5910
5911 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005912 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005913
5914 list = PyList_New(0);
5915 if (!list)
5916 return NULL;
5917
5918 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005919 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005920
5921 else if (substring->length == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005922 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005923
5924 else if (substring->length == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005925 Py_DECREF(list);
5926 PyErr_SetString(PyExc_ValueError, "empty separator");
5927 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005928 }
5929 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005930 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005931}
5932
5933static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005935 PyUnicodeObject *str1,
5936 PyUnicodeObject *str2,
5937 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
5939 PyUnicodeObject *u;
5940
5941 if (maxcount < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005942 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
Fredrik Lundh347ee272006-05-24 16:35:18 +00005944 if (str1->length == str2->length) {
5945 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005946 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005947 if (str1->length == 1) {
5948 /* replace characters */
5949 Py_UNICODE u1, u2;
5950 if (!findchar(self->str, self->length, str1->str[0]))
5951 goto nothing;
5952 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5953 if (!u)
5954 return NULL;
5955 Py_UNICODE_COPY(u->str, self->str, self->length);
5956 u1 = str1->str[0];
5957 u2 = str2->str[0];
5958 for (i = 0; i < u->length; i++)
5959 if (u->str[i] == u1) {
5960 if (--maxcount < 0)
5961 break;
5962 u->str[i] = u2;
5963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005965 i = fastsearch(
5966 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005968 if (i < 0)
5969 goto nothing;
5970 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5971 if (!u)
5972 return NULL;
5973 Py_UNICODE_COPY(u->str, self->str, self->length);
5974 while (i <= self->length - str1->length)
5975 if (Py_UNICODE_MATCH(self, i, str1)) {
5976 if (--maxcount < 0)
5977 break;
5978 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5979 i += str1->length;
5980 } else
5981 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005984
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005985 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005986 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 Py_UNICODE *p;
5988
5989 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005990 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 if (n > maxcount)
5992 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005993 if (n == 0)
5994 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005995 /* new_size = self->length + n * (str2->length - str1->length)); */
5996 delta = (str2->length - str1->length);
5997 if (delta == 0) {
5998 new_size = self->length;
5999 } else {
6000 product = n * (str2->length - str1->length);
6001 if ((product / (str2->length - str1->length)) != n) {
6002 PyErr_SetString(PyExc_OverflowError,
6003 "replace string is too long");
6004 return NULL;
6005 }
6006 new_size = self->length + product;
6007 if (new_size < 0) {
6008 PyErr_SetString(PyExc_OverflowError,
6009 "replace string is too long");
6010 return NULL;
6011 }
6012 }
6013 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006014 if (!u)
6015 return NULL;
6016 i = 0;
6017 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006018 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006019 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006020 while (n-- > 0) {
6021 /* look for next match */
6022 j = i;
6023 while (j <= e) {
6024 if (Py_UNICODE_MATCH(self, j, str1))
6025 break;
6026 j++;
6027 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006028 if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006029 if (j > e)
6030 break;
6031 /* copy unchanged part [i:j] */
6032 Py_UNICODE_COPY(p, self->str+i, j-i);
6033 p += j - i;
6034 }
6035 /* copy substitution string */
6036 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006037 Py_UNICODE_COPY(p, str2->str, str2->length);
6038 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006039 }
6040 i = j + str1->length;
6041 }
6042 if (i < self->length)
6043 /* copy tail [i:] */
6044 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006045 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006046 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006047 while (n > 0) {
6048 Py_UNICODE_COPY(p, str2->str, str2->length);
6049 p += str2->length;
6050 if (--n <= 0)
6051 break;
6052 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006054 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 }
6056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006058
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006059 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006060 /* nothing to replace; return original string (when possible) */
6061 if (PyUnicode_CheckExact(self)) {
6062 Py_INCREF(self);
6063 return (PyObject *) self;
6064 }
6065 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066}
6067
6068/* --- Unicode Object Methods --------------------------------------------- */
6069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006070PyDoc_STRVAR(title__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006071 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072\n\
6073Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006074characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
6076static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006077unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 return fixup(self, fixtitle);
6080}
6081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006082PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006083 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084\n\
6085Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006086have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087
6088static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006089unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 return fixup(self, fixcapitalize);
6092}
6093
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place in the list, then join the words again.
   Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot with the new object */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx), fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6131
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006132/* Argument converter. Coerces to a single unicode character */
6133
6134static int
6135convert_uc(PyObject *obj, void *addr)
6136{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006137 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6138 PyObject *uniobj;
6139 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006140
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006141 uniobj = PyUnicode_FromObject(obj);
6142 if (uniobj == NULL) {
6143 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006144 "The fill character cannot be converted to Unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006145 return 0;
6146 }
6147 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6148 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006149 "The fill character must be exactly one character long");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006150 Py_DECREF(uniobj);
6151 return 0;
6152 }
6153 unistr = PyUnicode_AS_UNICODE(uniobj);
6154 *fillcharloc = unistr[0];
6155 Py_DECREF(uniobj);
6156 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006157}
6158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006159PyDoc_STRVAR(center__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006160 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006162Return S centered in a Unicode string of length width. Padding is\n\
6163done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164
6165static PyObject *
6166unicode_center(PyUnicodeObject *self, PyObject *args)
6167{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006168 Py_ssize_t marg, left;
6169 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006170 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
Thomas Woutersde017742006-02-16 19:34:37 +00006172 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 return NULL;
6174
Tim Peters7a29bd52001-09-12 03:03:31 +00006175 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 Py_INCREF(self);
6177 return (PyObject*) self;
6178 }
6179
6180 marg = width - self->length;
6181 left = marg / 2 + (marg & width & 1);
6182
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006183 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Marc-André Lemburge5034372000-08-08 08:04:29 +00006186#if 0
6187
6188/* This code should go into some future Unicode collation support
6189 module. The basic comparison should compare ordinals on a naive
Georg Brandla3c242c2009-10-27 14:19:50 +00006190 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006191
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006192/* speedy UTF-16 code point order comparison */
6193/* gleaned from: */
6194/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6195
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006196static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006197{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006198 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006199 0, 0, 0, 0, 0, 0, 0, 0,
6200 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006201 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006202};
6203
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204static int
6205unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6206{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006207 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006208
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 Py_UNICODE *s1 = str1->str;
6210 Py_UNICODE *s2 = str2->str;
6211
6212 len1 = str1->length;
6213 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006214
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006216 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006217
6218 c1 = *s1++;
6219 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006220
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006221 if (c1 > (1<<11) * 26)
6222 c1 += utf16Fixup[c1>>11];
6223 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006224 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006225 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006226
6227 if (c1 != c2)
6228 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006229
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006230 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 }
6232
6233 return (len1 < len2) ? -1 : (len1 != len2);
6234}
6235
Marc-André Lemburge5034372000-08-08 08:04:29 +00006236#else
6237
6238static int
6239unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6240{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006241 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006242
6243 Py_UNICODE *s1 = str1->str;
6244 Py_UNICODE *s2 = str2->str;
6245
6246 len1 = str1->length;
6247 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006248
Marc-André Lemburge5034372000-08-08 08:04:29 +00006249 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006250 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006251
Fredrik Lundh45714e92001-06-26 16:39:36 +00006252 c1 = *s1++;
6253 c2 = *s2++;
6254
6255 if (c1 != c2)
6256 return (c1 < c2) ? -1 : 1;
6257
Marc-André Lemburge5034372000-08-08 08:04:29 +00006258 len1--; len2--;
6259 }
6260
6261 return (len1 < len2) ? -1 : (len1 != len2);
6262}
6263
6264#endif
6265
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266int PyUnicode_Compare(PyObject *left,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006267 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268{
6269 PyUnicodeObject *u = NULL, *v = NULL;
6270 int result;
6271
6272 /* Coerce the two arguments */
6273 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6274 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006275 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6277 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006278 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279
Thomas Wouters7e474022000-07-16 12:04:32 +00006280 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 if (v == u) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006282 Py_DECREF(u);
6283 Py_DECREF(v);
6284 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 }
6286
6287 result = unicode_compare(u, v);
6288
6289 Py_DECREF(u);
6290 Py_DECREF(v);
6291 return result;
6292
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006293 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 Py_XDECREF(u);
6295 Py_XDECREF(v);
6296 return -1;
6297}
6298
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006299PyObject *PyUnicode_RichCompare(PyObject *left,
6300 PyObject *right,
6301 int op)
6302{
6303 int result;
6304
6305 result = PyUnicode_Compare(left, right);
6306 if (result == -1 && PyErr_Occurred())
6307 goto onError;
6308
6309 /* Convert the return value to a Boolean */
6310 switch (op) {
6311 case Py_EQ:
6312 result = (result == 0);
6313 break;
6314 case Py_NE:
6315 result = (result != 0);
6316 break;
6317 case Py_LE:
6318 result = (result <= 0);
6319 break;
6320 case Py_GE:
6321 result = (result >= 0);
6322 break;
6323 case Py_LT:
6324 result = (result == -1);
6325 break;
6326 case Py_GT:
6327 result = (result == 1);
6328 break;
6329 }
6330 return PyBool_FromLong(result);
6331
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006332 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006333
6334 /* Standard case
6335
6336 Type errors mean that PyUnicode_FromObject() could not convert
6337 one of the arguments (usually the right hand side) to Unicode,
6338 ie. we can't handle the comparison request. However, it is
6339 possible that the other object knows a comparison method, which
6340 is why we return Py_NotImplemented to give the other object a
6341 chance.
6342
6343 */
6344 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6345 PyErr_Clear();
6346 Py_INCREF(Py_NotImplemented);
6347 return Py_NotImplemented;
6348 }
6349 if (op != Py_EQ && op != Py_NE)
6350 return NULL;
6351
6352 /* Equality comparison.
6353
6354 This is a special case: we silence any PyExc_UnicodeDecodeError
6355 and instead turn it into a PyErr_UnicodeWarning.
6356
6357 */
6358 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6359 return NULL;
6360 PyErr_Clear();
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006361 if (PyErr_Warn(PyExc_UnicodeWarning,
6362 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006363 "Unicode equal comparison "
6364 "failed to convert both arguments to Unicode - "
6365 "interpreting them as being unequal" :
6366 "Unicode unequal comparison "
6367 "failed to convert both arguments to Unicode - "
6368 "interpreting them as being unequal"
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006369 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006370 return NULL;
6371 result = (op == Py_NE);
6372 return PyBool_FromLong(result);
6373}
6374
Guido van Rossum403d68b2000-03-13 15:55:09 +00006375int PyUnicode_Contains(PyObject *container,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006376 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006377{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006378 PyObject *str, *sub;
6379 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006380
6381 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006382 sub = PyUnicode_FromObject(element);
6383 if (!sub) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006384 PyErr_SetString(PyExc_TypeError,
6385 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006386 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006387 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006388
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006389 str = PyUnicode_FromObject(container);
6390 if (!str) {
6391 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006392 return -1;
6393 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006394
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006395 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006396
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006397 Py_DECREF(str);
6398 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006399
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006400 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006401}
6402
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403/* Concat to string or Unicode object giving a new Unicode object. */
6404
6405PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006406 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
6408 PyUnicodeObject *u = NULL, *v = NULL, *w;
6409
6410 /* Coerce the two arguments */
6411 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6412 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006413 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6415 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006416 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417
6418 /* Shortcuts */
6419 if (v == unicode_empty) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006420 Py_DECREF(v);
6421 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 }
6423 if (u == unicode_empty) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006424 Py_DECREF(u);
6425 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 }
6427
6428 /* Concat the two Unicode strings */
6429 w = _PyUnicode_New(u->length + v->length);
6430 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006431 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 Py_UNICODE_COPY(w->str, u->str, u->length);
6433 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6434
6435 Py_DECREF(u);
6436 Py_DECREF(v);
6437 return (PyObject *)w;
6438
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006439 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 Py_XDECREF(u);
6441 Py_XDECREF(v);
6442 return NULL;
6443}
6444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006445PyDoc_STRVAR(count__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006446 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006448Return the number of non-overlapping occurrences of substring sub in\n\
6449Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006450interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451
6452static PyObject *
6453unicode_count(PyUnicodeObject *self, PyObject *args)
6454{
6455 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006456 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006457 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 PyObject *result;
6459
Guido van Rossumb8872e62000-05-09 14:14:27 +00006460 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006461 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 return NULL;
6463
6464 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006465 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006467 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006468
Fredrik Lundhc8162812006-05-26 19:33:03 +00006469 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006471 result = PyInt_FromSsize_t(
6472 stringlib_count(self->str + start, end - start,
6473 substring->str, substring->length)
6474 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475
6476 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006477
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 return result;
6479}
6480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006481PyDoc_STRVAR(encode__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006482 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006484Encodes S using the codec registered for encoding. encoding defaults\n\
6485to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006486handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6488'xmlcharrefreplace' as well as any other name registered with\n\
6489codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490
6491static PyObject *
6492unicode_encode(PyUnicodeObject *self, PyObject *args)
6493{
6494 char *encoding = NULL;
6495 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006496 PyObject *v;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006497
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6499 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006500 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006501 if (v == NULL)
6502 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006503 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006504 PyErr_Format(PyExc_TypeError,
6505 "encoder did not return a string/unicode object "
6506 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006507 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006508 Py_DECREF(v);
6509 return NULL;
6510 }
6511 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006512
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006513 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006514 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006515}
6516
6517PyDoc_STRVAR(decode__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006518 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006519\n\
6520Decodes S using the codec registered for encoding. encoding defaults\n\
6521to the default encoding. errors may be given to set a different error\n\
6522handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6523a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6524as well as any other name registerd with codecs.register_error that is\n\
6525able to handle UnicodeDecodeErrors.");
6526
6527static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006528unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006529{
6530 char *encoding = NULL;
6531 char *errors = NULL;
6532 PyObject *v;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006533
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006534 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6535 return NULL;
6536 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006537 if (v == NULL)
6538 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006539 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006540 PyErr_Format(PyExc_TypeError,
6541 "decoder did not return a string/unicode object "
6542 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006543 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006544 Py_DECREF(v);
6545 return NULL;
6546 }
6547 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006548
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006549 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551}
6552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006553PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006554 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555\n\
6556Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006557If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558
6559static PyObject*
6560unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6561{
6562 Py_UNICODE *e;
6563 Py_UNICODE *p;
6564 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006565 Py_UNICODE *qe;
6566 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 PyUnicodeObject *u;
6568 int tabsize = 8;
6569
6570 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006571 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
Thomas Wouters7e474022000-07-16 12:04:32 +00006573 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006574 i = 0; /* chars up to and including most recent \n or \r */
6575 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6576 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 for (p = self->str; p < e; p++)
6578 if (*p == '\t') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006579 if (tabsize > 0) {
6580 incr = tabsize - (j % tabsize); /* cannot overflow */
6581 if (j > PY_SSIZE_T_MAX - incr)
6582 goto overflow1;
6583 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006584 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006587 if (j > PY_SSIZE_T_MAX - 1)
6588 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 j++;
6590 if (*p == '\n' || *p == '\r') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006591 if (i > PY_SSIZE_T_MAX - j)
6592 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006594 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 }
6596 }
6597
Guido van Rossum5bdff602008-03-11 21:18:06 +00006598 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006599 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006600
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 /* Second pass: create output string and fill it */
6602 u = _PyUnicode_New(i + j);
6603 if (!u)
6604 return NULL;
6605
Guido van Rossum5bdff602008-03-11 21:18:06 +00006606 j = 0; /* same as in first pass */
6607 q = u->str; /* next output char */
6608 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
6610 for (p = self->str; p < e; p++)
6611 if (*p == '\t') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006612 if (tabsize > 0) {
6613 i = tabsize - (j % tabsize);
6614 j += i;
6615 while (i--) {
6616 if (q >= qe)
6617 goto overflow2;
6618 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006619 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006620 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006621 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006622 else {
6623 if (q >= qe)
6624 goto overflow2;
6625 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006626 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 if (*p == '\n' || *p == '\r')
6628 j = 0;
6629 }
6630
6631 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006632
6633 overflow2:
6634 Py_DECREF(u);
6635 overflow1:
6636 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6637 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638}
6639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006640PyDoc_STRVAR(find__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006641 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642\n\
6643Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006644such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645arguments start and end are interpreted as in slice notation.\n\
6646\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006647Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648
6649static PyObject *
6650unicode_find(PyUnicodeObject *self, PyObject *args)
6651{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006652 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006653 Py_ssize_t start;
6654 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006655 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656
Facundo Batista57d56692007-11-16 18:04:14 +00006657 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006660 result = stringlib_find_slice(
6661 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6662 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6663 start, end
6664 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665
6666 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006667
6668 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669}
6670
6671static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006672unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673{
6674 if (index < 0 || index >= self->length) {
6675 PyErr_SetString(PyExc_IndexError, "string index out of range");
6676 return NULL;
6677 }
6678
6679 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6680}
6681
6682static long
6683unicode_hash(PyUnicodeObject *self)
6684{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006685 /* Since Unicode objects compare equal to their ASCII string
6686 counterparts, they should use the individual character values
6687 as basis for their hash value. This is needed to assure that
6688 strings and Unicode objects behave in the same way as
6689 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
Martin v. Löwis18e16552006-02-15 17:27:45 +00006691 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006692 register Py_UNICODE *p;
6693 register long x;
6694
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 if (self->hash != -1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006696 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006697 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006698 /*
6699 We make the hash of the empty string be 0, rather than using
6700 (prefix ^ suffix), since this slightly obfuscates the hash secret
6701 */
6702 if (len == 0) {
6703 self->hash = 0;
6704 return 0;
6705 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006706 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006707 x = _Py_HashSecret.prefix;
6708 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006709 while (--len >= 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006710 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006711 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006712 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006713 if (x == -1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006714 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006715 self->hash = x;
6716 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006719PyDoc_STRVAR(index__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006720 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006722Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723
6724static PyObject *
6725unicode_index(PyUnicodeObject *self, PyObject *args)
6726{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006727 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006728 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006729 Py_ssize_t start;
6730 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731
Facundo Batista57d56692007-11-16 18:04:14 +00006732 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006735 result = stringlib_find_slice(
6736 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6737 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6738 start, end
6739 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
6741 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 if (result < 0) {
6744 PyErr_SetString(PyExc_ValueError, "substring not found");
6745 return NULL;
6746 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006747
Martin v. Löwis18e16552006-02-15 17:27:45 +00006748 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749}
6750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006751PyDoc_STRVAR(islower__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006752 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006754Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006755at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756
6757static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006758unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759{
6760 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6761 register const Py_UNICODE *e;
6762 int cased;
6763
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 /* Shortcut for single character strings */
6765 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006766 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006768 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006769 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006770 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006771
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 e = p + PyUnicode_GET_SIZE(self);
6773 cased = 0;
6774 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006775 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006776
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006777 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6778 return PyBool_FromLong(0);
6779 else if (!cased && Py_UNICODE_ISLOWER(ch))
6780 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006782 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783}
6784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006785PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006786 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006788Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006789at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790
6791static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006792unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793{
6794 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6795 register const Py_UNICODE *e;
6796 int cased;
6797
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 /* Shortcut for single character strings */
6799 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006800 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006802 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006803 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006804 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006805
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 e = p + PyUnicode_GET_SIZE(self);
6807 cased = 0;
6808 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006809 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006810
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006811 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6812 return PyBool_FromLong(0);
6813 else if (!cased && Py_UNICODE_ISUPPER(ch))
6814 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006816 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817}
6818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006819PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006820 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006822Return True if S is a titlecased string and there is at least one\n\
6823character in S, i.e. upper- and titlecase characters may only\n\
6824follow uncased characters and lowercase characters only cased ones.\n\
6825Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826
6827static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006828unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829{
6830 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6831 register const Py_UNICODE *e;
6832 int cased, previous_is_cased;
6833
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 /* Shortcut for single character strings */
6835 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006836 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6837 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006839 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006840 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006841 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 e = p + PyUnicode_GET_SIZE(self);
6844 cased = 0;
6845 previous_is_cased = 0;
6846 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006847 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006848
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006849 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6850 if (previous_is_cased)
6851 return PyBool_FromLong(0);
6852 previous_is_cased = 1;
6853 cased = 1;
6854 }
6855 else if (Py_UNICODE_ISLOWER(ch)) {
6856 if (!previous_is_cased)
6857 return PyBool_FromLong(0);
6858 previous_is_cased = 1;
6859 cased = 1;
6860 }
6861 else
6862 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006864 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865}
6866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006867PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006868 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006870Return True if all characters in S are whitespace\n\
6871and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872
6873static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006874unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875{
6876 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6877 register const Py_UNICODE *e;
6878
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 /* Shortcut for single character strings */
6880 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006881 Py_UNICODE_ISSPACE(*p))
6882 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006884 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006885 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006886 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006887
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 e = p + PyUnicode_GET_SIZE(self);
6889 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006890 if (!Py_UNICODE_ISSPACE(*p))
6891 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006893 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894}
6895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006896PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006897 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006898\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006899Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006900and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006901
6902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006903unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006904{
6905 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6906 register const Py_UNICODE *e;
6907
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006908 /* Shortcut for single character strings */
6909 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006910 Py_UNICODE_ISALPHA(*p))
6911 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006912
6913 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006914 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006915 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006916
6917 e = p + PyUnicode_GET_SIZE(self);
6918 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006919 if (!Py_UNICODE_ISALPHA(*p))
6920 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006921 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006923}
6924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006925PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006926 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006927\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006928Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006929and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006930
6931static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006932unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006933{
6934 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6935 register const Py_UNICODE *e;
6936
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006937 /* Shortcut for single character strings */
6938 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006939 Py_UNICODE_ISALNUM(*p))
6940 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006941
6942 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006943 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006944 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006945
6946 e = p + PyUnicode_GET_SIZE(self);
6947 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006948 if (!Py_UNICODE_ISALNUM(*p))
6949 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006950 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006951 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006952}
6953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006954PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006955 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006957Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006958False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959
6960static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006961unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962{
6963 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6964 register const Py_UNICODE *e;
6965
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 /* Shortcut for single character strings */
6967 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006968 Py_UNICODE_ISDECIMAL(*p))
6969 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006971 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006972 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006973 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006974
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 e = p + PyUnicode_GET_SIZE(self);
6976 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006977 if (!Py_UNICODE_ISDECIMAL(*p))
6978 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006980 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981}
6982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006983PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006984 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006986Return True if all characters in S are digits\n\
6987and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988
6989static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006990unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991{
6992 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6993 register const Py_UNICODE *e;
6994
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 /* Shortcut for single character strings */
6996 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006997 Py_UNICODE_ISDIGIT(*p))
6998 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007000 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007001 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007002 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007003
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 e = p + PyUnicode_GET_SIZE(self);
7005 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007006 if (!Py_UNICODE_ISDIGIT(*p))
7007 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007009 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010}
7011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007012PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007013 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007015Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007016False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017
7018static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007019unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020{
7021 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7022 register const Py_UNICODE *e;
7023
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 /* Shortcut for single character strings */
7025 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007026 Py_UNICODE_ISNUMERIC(*p))
7027 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007029 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007030 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007031 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007032
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 e = p + PyUnicode_GET_SIZE(self);
7034 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007035 if (!Py_UNICODE_ISNUMERIC(*p))
7036 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007038 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039}
7040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007041PyDoc_STRVAR(join__doc__,
Georg Brandl5d2eb342009-10-27 15:08:27 +00007042 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043\n\
7044Return a string which is the concatenation of the strings in the\n\
Georg Brandl5d2eb342009-10-27 15:08:27 +00007045iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046
7047static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007048unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007050 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051}
7052
Martin v. Löwis18e16552006-02-15 17:27:45 +00007053static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054unicode_length(PyUnicodeObject *self)
7055{
7056 return self->length;
7057}
7058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007059PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007060 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007062Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007063done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064
7065static PyObject *
7066unicode_ljust(PyUnicodeObject *self, PyObject *args)
7067{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007068 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007069 Py_UNICODE fillchar = ' ';
7070
Martin v. Löwis412fb672006-04-13 06:34:32 +00007071 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 return NULL;
7073
Tim Peters7a29bd52001-09-12 03:03:31 +00007074 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 Py_INCREF(self);
7076 return (PyObject*) self;
7077 }
7078
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007079 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080}
7081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007082PyDoc_STRVAR(lower__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007083 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007085Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086
7087static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007088unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 return fixup(self, fixlower);
7091}
7092
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip mode, indexed by the
   constants above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for mode i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7101
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007102/* externally visible for str.strip(unicode) */
7103PyObject *
7104_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7105{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007106 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7107 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7108 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7109 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7110 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007112 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007113
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007114 i = 0;
7115 if (striptype != RIGHTSTRIP) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007116 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7117 i++;
7118 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007119 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007120
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007121 j = len;
7122 if (striptype != LEFTSTRIP) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007123 do {
7124 j--;
7125 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7126 j++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007127 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007128
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007129 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007130 Py_INCREF(self);
7131 return (PyObject*)self;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007132 }
7133 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007134 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007135}
7136
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137
7138static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007139do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007141 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7142 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007143
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007144 i = 0;
7145 if (striptype != RIGHTSTRIP) {
7146 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7147 i++;
7148 }
7149 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007150
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007151 j = len;
7152 if (striptype != LEFTSTRIP) {
7153 do {
7154 j--;
7155 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7156 j++;
7157 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007158
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007159 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7160 Py_INCREF(self);
7161 return (PyObject*)self;
7162 }
7163 else
7164 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165}
7166
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007167
7168static PyObject *
7169do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7170{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007171 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007172
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007173 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7174 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007175
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007176 if (sep != NULL && sep != Py_None) {
7177 if (PyUnicode_Check(sep))
7178 return _PyUnicode_XStrip(self, striptype, sep);
7179 else if (PyString_Check(sep)) {
7180 PyObject *res;
7181 sep = PyUnicode_FromObject(sep);
7182 if (sep==NULL)
7183 return NULL;
7184 res = _PyUnicode_XStrip(self, striptype, sep);
7185 Py_DECREF(sep);
7186 return res;
7187 }
7188 else {
7189 PyErr_Format(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007190 "%s arg must be None, unicode or str",
7191 STRIPNAME(striptype));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007192 return NULL;
7193 }
7194 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007195
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007196 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007197}
7198
7199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007200PyDoc_STRVAR(strip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007201 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007202\n\
7203Return a copy of the string S with leading and trailing\n\
7204whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007205If chars is given and not None, remove characters in chars instead.\n\
7206If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007207
7208static PyObject *
7209unicode_strip(PyUnicodeObject *self, PyObject *args)
7210{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007211 if (PyTuple_GET_SIZE(args) == 0)
7212 return do_strip(self, BOTHSTRIP); /* Common case */
7213 else
7214 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007215}
7216
7217
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007218PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007219 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007220\n\
7221Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007222If chars is given and not None, remove characters in chars instead.\n\
7223If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007224
7225static PyObject *
7226unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7227{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007228 if (PyTuple_GET_SIZE(args) == 0)
7229 return do_strip(self, LEFTSTRIP); /* Common case */
7230 else
7231 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007232}
7233
7234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007235PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007236 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007237\n\
7238Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007239If chars is given and not None, remove characters in chars instead.\n\
7240If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007241
7242static PyObject *
7243unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7244{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007245 if (PyTuple_GET_SIZE(args) == 0)
7246 return do_strip(self, RIGHTSTRIP); /* Common case */
7247 else
7248 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007249}
7250
7251
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007253unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254{
7255 PyUnicodeObject *u;
7256 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007257 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007258 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259
7260 if (len < 0)
7261 len = 0;
7262
Tim Peters7a29bd52001-09-12 03:03:31 +00007263 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 /* no repeat, return original string */
7265 Py_INCREF(str);
7266 return (PyObject*) str;
7267 }
Tim Peters8f422462000-09-09 06:13:41 +00007268
7269 /* ensure # of chars needed doesn't overflow int and # of bytes
7270 * needed doesn't overflow size_t
7271 */
7272 nchars = len * str->length;
7273 if (len && nchars / len != str->length) {
7274 PyErr_SetString(PyExc_OverflowError,
7275 "repeated string is too long");
7276 return NULL;
7277 }
7278 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7279 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7280 PyErr_SetString(PyExc_OverflowError,
7281 "repeated string is too long");
7282 return NULL;
7283 }
7284 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 if (!u)
7286 return NULL;
7287
7288 p = u->str;
7289
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007290 if (str->length == 1 && len > 0) {
7291 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007292 } else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007293 Py_ssize_t done = 0; /* number of characters copied this far */
7294 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007295 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007296 done = str->length;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007297 }
7298 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007299 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007300 Py_UNICODE_COPY(p+done, p, n);
7301 done += n;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007302 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304
7305 return (PyObject*) u;
7306}
7307
7308PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007309 PyObject *subobj,
7310 PyObject *replobj,
7311 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312{
7313 PyObject *self;
7314 PyObject *str1;
7315 PyObject *str2;
7316 PyObject *result;
7317
7318 self = PyUnicode_FromObject(obj);
7319 if (self == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 str1 = PyUnicode_FromObject(subobj);
7322 if (str1 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007323 Py_DECREF(self);
7324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325 }
7326 str2 = PyUnicode_FromObject(replobj);
7327 if (str2 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007328 Py_DECREF(self);
7329 Py_DECREF(str1);
7330 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331 }
Tim Petersced69f82003-09-16 20:30:58 +00007332 result = replace((PyUnicodeObject *)self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007333 (PyUnicodeObject *)str1,
7334 (PyUnicodeObject *)str2,
7335 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336 Py_DECREF(self);
7337 Py_DECREF(str1);
7338 Py_DECREF(str2);
7339 return result;
7340}
7341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007342PyDoc_STRVAR(replace__doc__,
Ezio Melotti6327bf12010-06-26 18:47:01 +00007343 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344\n\
7345Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007346old replaced by new. If the optional argument count is\n\
7347given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348
7349static PyObject*
7350unicode_replace(PyUnicodeObject *self, PyObject *args)
7351{
7352 PyUnicodeObject *str1;
7353 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007354 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 PyObject *result;
7356
Martin v. Löwis18e16552006-02-15 17:27:45 +00007357 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 return NULL;
7359 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7360 if (str1 == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007363 if (str2 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007364 Py_DECREF(str1);
7365 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368 result = replace(self, str1, str2, maxcount);
7369
7370 Py_DECREF(str1);
7371 Py_DECREF(str2);
7372 return result;
7373}
7374
7375static
7376PyObject *unicode_repr(PyObject *unicode)
7377{
7378 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007379 PyUnicode_GET_SIZE(unicode),
7380 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381}
7382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007383PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007384 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385\n\
7386Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007387such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388arguments start and end are interpreted as in slice notation.\n\
7389\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007390Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391
7392static PyObject *
7393unicode_rfind(PyUnicodeObject *self, PyObject *args)
7394{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007395 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007396 Py_ssize_t start;
7397 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007398 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399
Facundo Batista57d56692007-11-16 18:04:14 +00007400 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007403 result = stringlib_rfind_slice(
7404 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7405 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7406 start, end
7407 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408
7409 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007410
7411 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412}
7413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007414PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007415 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007417Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
7419static PyObject *
7420unicode_rindex(PyUnicodeObject *self, PyObject *args)
7421{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007422 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007423 Py_ssize_t start;
7424 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007425 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426
Facundo Batista57d56692007-11-16 18:04:14 +00007427 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007428 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007430 result = stringlib_rfind_slice(
7431 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7432 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7433 start, end
7434 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435
7436 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007437
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 if (result < 0) {
7439 PyErr_SetString(PyExc_ValueError, "substring not found");
7440 return NULL;
7441 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007442 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443}
7444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007445PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007446 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007448Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007449done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450
7451static PyObject *
7452unicode_rjust(PyUnicodeObject *self, PyObject *args)
7453{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007454 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007455 Py_UNICODE fillchar = ' ';
7456
Martin v. Löwis412fb672006-04-13 06:34:32 +00007457 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 return NULL;
7459
Tim Peters7a29bd52001-09-12 03:03:31 +00007460 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 Py_INCREF(self);
7462 return (PyObject*) self;
7463 }
7464
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007465 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466}
7467
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007469unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470{
7471 /* standard clamping */
7472 if (start < 0)
7473 start = 0;
7474 if (end < 0)
7475 end = 0;
7476 if (end > self->length)
7477 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007478 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 /* full slice, return original string */
7480 Py_INCREF(self);
7481 return (PyObject*) self;
7482 }
7483 if (start > end)
7484 start = end;
7485 /* copy slice */
7486 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007487 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488}
7489
7490PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007491 PyObject *sep,
7492 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493{
7494 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007495
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 s = PyUnicode_FromObject(s);
7497 if (s == NULL)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007498 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007499 if (sep != NULL) {
7500 sep = PyUnicode_FromObject(sep);
7501 if (sep == NULL) {
7502 Py_DECREF(s);
7503 return NULL;
7504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 }
7506
7507 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7508
7509 Py_DECREF(s);
7510 Py_XDECREF(sep);
7511 return result;
7512}
7513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007514PyDoc_STRVAR(split__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007515 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516\n\
7517Return a list of the words in S, using sep as the\n\
7518delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007519splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007520whitespace string is a separator and empty strings are\n\
7521removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522
7523static PyObject*
7524unicode_split(PyUnicodeObject *self, PyObject *args)
7525{
7526 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007527 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
Martin v. Löwis18e16552006-02-15 17:27:45 +00007529 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530 return NULL;
7531
7532 if (substring == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007533 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 else if (PyUnicode_Check(substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007535 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007537 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538}
7539
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007540PyObject *
7541PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7542{
7543 PyObject* str_obj;
7544 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007545 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007546
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007547 str_obj = PyUnicode_FromObject(str_in);
7548 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007549 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007550 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007551 if (!sep_obj) {
7552 Py_DECREF(str_obj);
7553 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007554 }
7555
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007556 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007557 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7558 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7559 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007560
Fredrik Lundhb9479482006-05-26 17:22:38 +00007561 Py_DECREF(sep_obj);
7562 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007563
7564 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007565}
7566
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007567
7568PyObject *
7569PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7570{
7571 PyObject* str_obj;
7572 PyObject* sep_obj;
7573 PyObject* out;
7574
7575 str_obj = PyUnicode_FromObject(str_in);
7576 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007577 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007578 sep_obj = PyUnicode_FromObject(sep_in);
7579 if (!sep_obj) {
7580 Py_DECREF(str_obj);
7581 return NULL;
7582 }
7583
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007584 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007585 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7586 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7587 );
7588
7589 Py_DECREF(sep_obj);
7590 Py_DECREF(str_obj);
7591
7592 return out;
7593}
7594
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007595PyDoc_STRVAR(partition__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007596 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007597\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007598Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007599the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007600found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007601
7602static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007603unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007604{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007605 return PyUnicode_Partition((PyObject *)self, separator);
7606}
7607
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007608PyDoc_STRVAR(rpartition__doc__,
Ezio Melottidabb5f72010-01-25 11:46:11 +00007609 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007610\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007611Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007612the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007613separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007614
7615static PyObject*
7616unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7617{
7618 return PyUnicode_RPartition((PyObject *)self, separator);
7619}
7620
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007621PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007622 PyObject *sep,
7623 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007624{
7625 PyObject *result;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007626
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007627 s = PyUnicode_FromObject(s);
7628 if (s == NULL)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007629 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007630 if (sep != NULL) {
7631 sep = PyUnicode_FromObject(sep);
7632 if (sep == NULL) {
7633 Py_DECREF(s);
7634 return NULL;
7635 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007636 }
7637
7638 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7639
7640 Py_DECREF(s);
7641 Py_XDECREF(sep);
7642 return result;
7643}
7644
7645PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007646 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007647\n\
7648Return a list of the words in S, using sep as the\n\
7649delimiter string, starting at the end of the string and\n\
7650working to the front. If maxsplit is given, at most maxsplit\n\
7651splits are done. If sep is not specified, any whitespace string\n\
7652is a separator.");
7653
7654static PyObject*
7655unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7656{
7657 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007658 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007659
Martin v. Löwis18e16552006-02-15 17:27:45 +00007660 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007661 return NULL;
7662
7663 if (substring == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007664 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007665 else if (PyUnicode_Check(substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007666 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007667 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007668 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007669}
7670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007671PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007672 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673\n\
7674Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007675Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007676is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677
7678static PyObject*
7679unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7680{
Guido van Rossum86662912000-04-11 15:38:46 +00007681 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682
Guido van Rossum86662912000-04-11 15:38:46 +00007683 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684 return NULL;
7685
Guido van Rossum86662912000-04-11 15:38:46 +00007686 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687}
7688
7689static
7690PyObject *unicode_str(PyUnicodeObject *self)
7691{
Fred Drakee4315f52000-05-09 19:53:39 +00007692 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693}
7694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007695PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007696 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697\n\
7698Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007699and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700
7701static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007702unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704 return fixup(self, fixswapcase);
7705}
7706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007707PyDoc_STRVAR(translate__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007708 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709\n\
7710Return a copy of the string S, where all characters have been mapped\n\
7711through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007712Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7713Unmapped characters are left untouched. Characters mapped to None\n\
7714are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715
7716static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007717unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718{
Tim Petersced69f82003-09-16 20:30:58 +00007719 return PyUnicode_TranslateCharmap(self->str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007720 self->length,
7721 table,
7722 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723}
7724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007725PyDoc_STRVAR(upper__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007726 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007728Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729
7730static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007731unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 return fixup(self, fixupper);
7734}
7735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007736PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007737 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738\n\
Georg Brandl98064072008-09-09 19:26:00 +00007739Pad a numeric string S with zeros on the left, to fill a field\n\
7740of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741
7742static PyObject *
7743unicode_zfill(PyUnicodeObject *self, PyObject *args)
7744{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007745 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 PyUnicodeObject *u;
7747
Martin v. Löwis18e16552006-02-15 17:27:45 +00007748 Py_ssize_t width;
7749 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 return NULL;
7751
7752 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007753 if (PyUnicode_CheckExact(self)) {
7754 Py_INCREF(self);
7755 return (PyObject*) self;
7756 }
7757 else
7758 return PyUnicode_FromUnicode(
7759 PyUnicode_AS_UNICODE(self),
7760 PyUnicode_GET_SIZE(self)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007761 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762 }
7763
7764 fill = width - self->length;
7765
7766 u = pad(self, fill, 0, '0');
7767
Walter Dörwald068325e2002-04-15 13:36:47 +00007768 if (u == NULL)
7769 return NULL;
7770
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 if (u->str[fill] == '+' || u->str[fill] == '-') {
7772 /* move sign to beginning of string */
7773 u->str[0] = u->str[fill];
7774 u->str[fill] = '0';
7775 }
7776
7777 return (PyObject*) u;
7778}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779
#if 0
/* Compiled out.  Would return the value of numfree (declared elsewhere
   in this file) as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007788PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007789 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007791Return True if S starts with the specified prefix, False otherwise.\n\
7792With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007793With optional end, stop comparing S at that position.\n\
7794prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795
7796static PyObject *
7797unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007798 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799{
Georg Brandl24250812006-06-09 18:45:48 +00007800 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007802 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007803 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007804 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805
Georg Brandl24250812006-06-09 18:45:48 +00007806 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007807 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7808 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007809 if (PyTuple_Check(subobj)) {
7810 Py_ssize_t i;
7811 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7812 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007813 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007814 if (substring == NULL)
7815 return NULL;
7816 result = tailmatch(self, substring, start, end, -1);
7817 Py_DECREF(substring);
7818 if (result) {
7819 Py_RETURN_TRUE;
7820 }
7821 }
7822 /* nothing matched */
7823 Py_RETURN_FALSE;
7824 }
7825 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007827 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007828 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007830 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831}
7832
7833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007834PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007835 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007837Return True if S ends with the specified suffix, False otherwise.\n\
7838With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007839With optional end, stop comparing S at that position.\n\
7840suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841
7842static PyObject *
7843unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007844 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845{
Georg Brandl24250812006-06-09 18:45:48 +00007846 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007848 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007849 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007850 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851
Georg Brandl24250812006-06-09 18:45:48 +00007852 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007853 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7854 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007855 if (PyTuple_Check(subobj)) {
7856 Py_ssize_t i;
7857 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7858 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007859 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007860 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007861 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007862 result = tailmatch(self, substring, start, end, +1);
7863 Py_DECREF(substring);
7864 if (result) {
7865 Py_RETURN_TRUE;
7866 }
7867 }
7868 Py_RETURN_FALSE;
7869 }
7870 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007872 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873
Georg Brandl24250812006-06-09 18:45:48 +00007874 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007876 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877}
7878
7879
Eric Smitha9f7d622008-02-17 19:46:49 +00007880/* Implements do_string_format, which is unicode because of stringlib */
7881#include "stringlib/string_format.h"
7882
7883PyDoc_STRVAR(format__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007884 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007885\n\
7886");
7887
Eric Smithdc13b792008-05-30 18:10:04 +00007888static PyObject *
7889unicode__format__(PyObject *self, PyObject *args)
7890{
7891 PyObject *format_spec;
7892 PyObject *result = NULL;
7893 PyObject *tmp = NULL;
7894
7895 /* If 2.x, convert format_spec to the same type as value */
7896 /* This is to allow things like u''.format('') */
7897 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7898 goto done;
7899 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7900 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007901 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007902 goto done;
7903 }
7904 tmp = PyObject_Unicode(format_spec);
7905 if (tmp == NULL)
7906 goto done;
7907 format_spec = tmp;
7908
7909 result = _PyUnicode_FormatAdvanced(self,
7910 PyUnicode_AS_UNICODE(format_spec),
7911 PyUnicode_GET_SIZE(format_spec));
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007912 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007913 Py_XDECREF(tmp);
7914 return result;
7915}
7916
Eric Smitha9f7d622008-02-17 19:46:49 +00007917PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007918 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007919\n\
7920");
7921
Robert Schuppenies901c9972008-06-10 10:10:31 +00007922static PyObject *
7923unicode__sizeof__(PyUnicodeObject *v)
7924{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007925 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7926 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007927}
7928
7929PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007930 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007931\n\
7932");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007933
7934static PyObject *
7935unicode_getnewargs(PyUnicodeObject *v)
7936{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007937 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007938}
7939
7940
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941static PyMethodDef unicode_methods[] = {
7942
7943 /* Order is according to common usage: often used methods should
7944 appear first, since lookup is done sequentially. */
7945
Georg Brandlecdc0a92006-03-30 12:19:07 +00007946 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007947 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7948 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007949 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007950 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7951 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7952 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7953 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7954 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7955 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7956 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007957 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007958 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7959 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7960 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007961 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007962 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007963/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7964 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7965 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7966 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007967 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007968 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007969 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007970 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007971 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7972 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7973 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7974 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7975 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7976 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7977 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7978 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7979 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7980 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7981 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7982 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7983 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7984 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007985 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007986 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7987 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7988 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7989 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007990 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007991#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007992 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993#endif
7994
7995#if 0
7996 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007997 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998#endif
7999
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008000 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 {NULL, NULL}
8002};
8003
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008004static PyObject *
8005unicode_mod(PyObject *v, PyObject *w)
8006{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008007 if (!PyUnicode_Check(v)) {
8008 Py_INCREF(Py_NotImplemented);
8009 return Py_NotImplemented;
8010 }
8011 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008012}
8013
8014static PyNumberMethods unicode_as_number = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008015 0, /*nb_add*/
8016 0, /*nb_subtract*/
8017 0, /*nb_multiply*/
8018 0, /*nb_divide*/
8019 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008020};
8021
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008023 (lenfunc) unicode_length, /* sq_length */
8024 PyUnicode_Concat, /* sq_concat */
8025 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8026 (ssizeargfunc) unicode_getitem, /* sq_item */
8027 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8028 0, /* sq_ass_item */
8029 0, /* sq_ass_slice */
8030 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031};
8032
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008033static PyObject*
8034unicode_subscript(PyUnicodeObject* self, PyObject* item)
8035{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008036 if (PyIndex_Check(item)) {
8037 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008038 if (i == -1 && PyErr_Occurred())
8039 return NULL;
8040 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008041 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008042 return unicode_getitem(self, i);
8043 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008044 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008045 Py_UNICODE* source_buf;
8046 Py_UNICODE* result_buf;
8047 PyObject* result;
8048
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008049 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008050 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008051 return NULL;
8052 }
8053
8054 if (slicelength <= 0) {
8055 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008056 } else if (start == 0 && step == 1 && slicelength == self->length &&
8057 PyUnicode_CheckExact(self)) {
8058 Py_INCREF(self);
8059 return (PyObject *)self;
8060 } else if (step == 1) {
8061 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008062 } else {
8063 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008064 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8065 sizeof(Py_UNICODE));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008066
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008067 if (result_buf == NULL)
8068 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008069
8070 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8071 result_buf[i] = source_buf[cur];
8072 }
Tim Petersced69f82003-09-16 20:30:58 +00008073
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008074 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008075 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008076 return result;
8077 }
8078 } else {
8079 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8080 return NULL;
8081 }
8082}
8083
8084static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008085 (lenfunc)unicode_length, /* mp_length */
8086 (binaryfunc)unicode_subscript, /* mp_subscript */
8087 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008088};
8089
Martin v. Löwis18e16552006-02-15 17:27:45 +00008090static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008092 Py_ssize_t index,
8093 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094{
8095 if (index != 0) {
8096 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008097 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 return -1;
8099 }
8100 *ptr = (void *) self->str;
8101 return PyUnicode_GET_DATA_SIZE(self);
8102}
8103
Martin v. Löwis18e16552006-02-15 17:27:45 +00008104static Py_ssize_t
8105unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008106 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107{
8108 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008109 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 return -1;
8111}
8112
8113static int
8114unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008115 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116{
8117 if (lenp)
8118 *lenp = PyUnicode_GET_DATA_SIZE(self);
8119 return 1;
8120}
8121
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008122static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008124 Py_ssize_t index,
8125 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126{
8127 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008128
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 if (index != 0) {
8130 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008131 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 return -1;
8133 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008134 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008136 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008137 *ptr = (void *) PyString_AS_STRING(str);
8138 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139}
8140
8141/* Helpers for PyUnicode_Format() */
8142
8143static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008144getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008146 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 if (argidx < arglen) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008148 (*p_argidx)++;
8149 if (arglen < 0)
8150 return args;
8151 else
8152 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 }
8154 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008155 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 return NULL;
8157}
8158
/* Bit flags describing a %-format specifier; they are OR-ed together into
   the `flags` argument of the format* helpers.  In this part of the file
   only F_ALT is consulted directly (it selects the '#' alternate form);
   the other names presumably mirror the '-', '+', ' ' and '0' printf
   flags -- confirm against PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164
Martin v. Löwis18e16552006-02-15 17:27:45 +00008165static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008166strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008168 register Py_ssize_t i;
8169 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170 for (i = len - 1; i >= 0; i--)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008171 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 return len;
8174}
8175
Neal Norwitzfc76d632006-01-10 06:03:13 +00008176static int
8177doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8178{
Tim Peters15231542006-02-16 01:08:01 +00008179 Py_ssize_t result;
8180
Neal Norwitzfc76d632006-01-10 06:03:13 +00008181 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008182 result = strtounicode(buffer, (char *)buffer);
8183 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008184}
8185
8186static int
8187longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8188{
Tim Peters15231542006-02-16 01:08:01 +00008189 Py_ssize_t result;
8190
Neal Norwitzfc76d632006-01-10 06:03:13 +00008191 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008192 result = strtounicode(buffer, (char *)buffer);
8193 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008194}
8195
Guido van Rossum078151d2002-08-11 04:24:12 +00008196/* XXX To save some code duplication, formatfloat/long/int could have been
8197 shared with stringobject.c, converting from 8-bit to Unicode after the
8198 formatting is done. */
8199
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200static int
8201formatfloat(Py_UNICODE *buf,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008202 size_t buflen,
8203 int flags,
8204 int prec,
8205 int type,
8206 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008208 /* fmt = '%#.' + `prec` + `type`
8209 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 char fmt[20];
8211 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008212
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 x = PyFloat_AsDouble(v);
8214 if (x == -1.0 && PyErr_Occurred())
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008215 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 if (prec < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008217 prec = 6;
Mark Dickinson75be68b2009-08-28 20:57:42 +00008218#if SIZEOF_INT > 4
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008219 /* make sure that the decimal representation of precision really does
8220 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
Mark Dickinson75be68b2009-08-28 20:57:42 +00008221 if (prec > 0x7fffffff) {
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008222 PyErr_SetString(PyExc_OverflowError,
8223 "outrageously large precision "
8224 "for formatted float");
8225 return -1;
8226 }
Mark Dickinson75be68b2009-08-28 20:57:42 +00008227#endif
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008228
Mark Dickinsona30f3492009-03-29 15:06:29 +00008229 if (type == 'f' && fabs(x) >= 1e50)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008230 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008231 /* Worst case length calc to ensure no buffer overrun:
8232
8233 'g' formats:
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008234 fmt = %#.<prec>g
8235 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8236 for any double rep.)
8237 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008238
8239 'f' formats:
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008240 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8241 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008242
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008243 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008244 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008245
8246 */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008247 if (((type == 'g' || type == 'G') &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008248 buflen <= (size_t)10 + (size_t)prec) ||
8249 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8250 PyErr_SetString(PyExc_OverflowError,
8251 "formatted float is too long (precision too large?)");
8252 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008253 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008254 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008255 (flags&F_ALT) ? "#" : "",
8256 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008257 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258}
8259
Tim Peters38fd5b62000-09-21 05:43:11 +00008260static PyObject*
8261formatlong(PyObject *val, int flags, int prec, int type)
8262{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008263 char *buf;
8264 int i, len;
8265 PyObject *str; /* temporary string object. */
8266 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008267
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008268 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8269 if (!str)
8270 return NULL;
8271 result = _PyUnicode_New(len);
8272 if (!result) {
8273 Py_DECREF(str);
8274 return NULL;
8275 }
8276 for (i = 0; i < len; i++)
8277 result->str[i] = buf[i];
8278 result->str[len] = 0;
8279 Py_DECREF(str);
8280 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008281}
8282
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283static int
8284formatint(Py_UNICODE *buf,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008285 size_t buflen,
8286 int flags,
8287 int prec,
8288 int type,
8289 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008291 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008292 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8293 * + 1 + 1
8294 * = 24
8295 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008296 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008297 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 long x;
8299
8300 x = PyInt_AsLong(v);
8301 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008302 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008303 if (x < 0 && type == 'u') {
8304 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008305 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008306 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8307 sign = "-";
8308 else
8309 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008311 prec = 1;
8312
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008313 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8314 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008315 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008316 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008317 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008318 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008319 return -1;
8320 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008321
8322 if ((flags & F_ALT) &&
8323 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008324 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008325 * of issues that cause pain:
8326 * - when 0 is being converted, the C standard leaves off
8327 * the '0x' or '0X', which is inconsistent with other
8328 * %#x/%#X conversions and inconsistent with Python's
8329 * hex() function
8330 * - there are platforms that violate the standard and
8331 * convert 0 with the '0x' or '0X'
8332 * (Metrowerks, Compaq Tru64)
8333 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008334 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008335 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008336 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008337 * We can achieve the desired consistency by inserting our
8338 * own '0x' or '0X' prefix, and substituting %x/%X in place
8339 * of %#x/%#X.
8340 *
8341 * Note that this is the same approach as used in
8342 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008343 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008344 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8345 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008346 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008347 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008348 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8349 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008350 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008351 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008352 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008353 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008354 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008355 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356}
8357
8358static int
8359formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008360 size_t buflen,
8361 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362{
Ezio Melotti85ddea72010-02-25 17:51:33 +00008363 PyObject *unistr;
8364 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008365 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008366 if (PyUnicode_Check(v)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008367 if (PyUnicode_GET_SIZE(v) != 1)
8368 goto onError;
8369 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008372 else if (PyString_Check(v)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008373 if (PyString_GET_SIZE(v) != 1)
8374 goto onError;
Ezio Melotti85ddea72010-02-25 17:51:33 +00008375 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8376 with a UnicodeDecodeError if 'char' is not decodable with the
8377 default encoding (usually ASCII, but it might be something else) */
8378 str = PyString_AS_STRING(v);
8379 if ((unsigned char)str[0] > 0x7F) {
8380 /* the char is not ASCII; try to decode the string using the
8381 default encoding and return -1 to let the UnicodeDecodeError
8382 be raised if the string can't be decoded */
8383 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8384 if (unistr == NULL)
8385 return -1;
8386 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8387 Py_DECREF(unistr);
8388 }
8389 else
8390 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008391 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392
8393 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008394 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 long x;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008396 x = PyInt_AsLong(v);
8397 if (x == -1 && PyErr_Occurred())
8398 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008399#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008400 if (x < 0 || x > 0x10ffff) {
8401 PyErr_SetString(PyExc_OverflowError,
8402 "%c arg not in range(0x110000) "
8403 "(wide Python build)");
8404 return -1;
8405 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008406#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008407 if (x < 0 || x > 0xffff) {
8408 PyErr_SetString(PyExc_OverflowError,
8409 "%c arg not in range(0x10000) "
8410 "(narrow Python build)");
8411 return -1;
8412 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008413#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008414 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415 }
8416 buf[1] = '\0';
8417 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008418
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008419 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008420 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008421 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008422 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423}
8424
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008425/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8426
8427 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8428 chars are formatted. XXX This is a magic number. Each formatting
8429 routine does bounds checking to ensure no overflow, but a better
8430 solution may be to malloc a buffer of appropriate size for each
8431 format. For now, the current solution is sufficient.
8432*/
/* Length of the scratch buffer used for formatting floats, ints and chars.
   The expansion is parenthesized so that it behaves as a single operand
   wherever the macro is used (e.g. `sizeof FORMATBUFLEN`). */
#define FORMATBUFLEN ((size_t)120)
8434
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008436 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437{
8438 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008439 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 int args_owned = 0;
8441 PyUnicodeObject *result = NULL;
8442 PyObject *dict = NULL;
8443 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008444
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 if (format == NULL || args == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008446 PyErr_BadInternalCall();
8447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 }
8449 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008450 if (uformat == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 fmt = PyUnicode_AS_UNICODE(uformat);
8453 fmtcnt = PyUnicode_GET_SIZE(uformat);
8454
8455 reslen = rescnt = fmtcnt + 100;
8456 result = _PyUnicode_New(reslen);
8457 if (result == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008458 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459 res = PyUnicode_AS_UNICODE(result);
8460
8461 if (PyTuple_Check(args)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008462 arglen = PyTuple_Size(args);
8463 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 }
8465 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008466 arglen = -1;
8467 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468 }
Christian Heimese93237d2007-12-19 02:37:44 +00008469 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008470 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008471 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472
8473 while (--fmtcnt >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008474 if (*fmt != '%') {
8475 if (--rescnt < 0) {
8476 rescnt = fmtcnt + 100;
8477 reslen += rescnt;
8478 if (_PyUnicode_Resize(&result, reslen) < 0)
8479 goto onError;
8480 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8481 --rescnt;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008482 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008483 *res++ = *fmt++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008484 }
8485 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008486 /* Got a format specifier */
8487 int flags = 0;
8488 Py_ssize_t width = -1;
8489 int prec = -1;
8490 Py_UNICODE c = '\0';
8491 Py_UNICODE fill;
8492 int isnumok;
8493 PyObject *v = NULL;
8494 PyObject *temp = NULL;
8495 Py_UNICODE *pbuf;
8496 Py_UNICODE sign;
8497 Py_ssize_t len;
8498 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8499
8500 fmt++;
8501 if (*fmt == '(') {
8502 Py_UNICODE *keystart;
8503 Py_ssize_t keylen;
8504 PyObject *key;
8505 int pcount = 1;
8506
8507 if (dict == NULL) {
8508 PyErr_SetString(PyExc_TypeError,
8509 "format requires a mapping");
8510 goto onError;
8511 }
8512 ++fmt;
8513 --fmtcnt;
8514 keystart = fmt;
8515 /* Skip over balanced parentheses */
8516 while (pcount > 0 && --fmtcnt >= 0) {
8517 if (*fmt == ')')
8518 --pcount;
8519 else if (*fmt == '(')
8520 ++pcount;
8521 fmt++;
8522 }
8523 keylen = fmt - keystart - 1;
8524 if (fmtcnt < 0 || pcount > 0) {
8525 PyErr_SetString(PyExc_ValueError,
8526 "incomplete format key");
8527 goto onError;
8528 }
8529#if 0
8530 /* keys are converted to strings using UTF-8 and
8531 then looked up since Python uses strings to hold
8532 variables names etc. in its namespaces and we
8533 wouldn't want to break common idioms. */
8534 key = PyUnicode_EncodeUTF8(keystart,
8535 keylen,
8536 NULL);
8537#else
8538 key = PyUnicode_FromUnicode(keystart, keylen);
8539#endif
8540 if (key == NULL)
8541 goto onError;
8542 if (args_owned) {
8543 Py_DECREF(args);
8544 args_owned = 0;
8545 }
8546 args = PyObject_GetItem(dict, key);
8547 Py_DECREF(key);
8548 if (args == NULL) {
8549 goto onError;
8550 }
8551 args_owned = 1;
8552 arglen = -1;
8553 argidx = -2;
8554 }
8555 while (--fmtcnt >= 0) {
8556 switch (c = *fmt++) {
8557 case '-': flags |= F_LJUST; continue;
8558 case '+': flags |= F_SIGN; continue;
8559 case ' ': flags |= F_BLANK; continue;
8560 case '#': flags |= F_ALT; continue;
8561 case '0': flags |= F_ZERO; continue;
8562 }
8563 break;
8564 }
8565 if (c == '*') {
8566 v = getnextarg(args, arglen, &argidx);
8567 if (v == NULL)
8568 goto onError;
8569 if (!PyInt_Check(v)) {
8570 PyErr_SetString(PyExc_TypeError,
8571 "* wants int");
8572 goto onError;
8573 }
8574 width = PyInt_AsLong(v);
8575 if (width < 0) {
8576 flags |= F_LJUST;
8577 width = -width;
8578 }
8579 if (--fmtcnt >= 0)
8580 c = *fmt++;
8581 }
8582 else if (c >= '0' && c <= '9') {
8583 width = c - '0';
8584 while (--fmtcnt >= 0) {
8585 c = *fmt++;
8586 if (c < '0' || c > '9')
8587 break;
8588 if ((width*10) / 10 != width) {
8589 PyErr_SetString(PyExc_ValueError,
8590 "width too big");
8591 goto onError;
8592 }
8593 width = width*10 + (c - '0');
8594 }
8595 }
8596 if (c == '.') {
8597 prec = 0;
8598 if (--fmtcnt >= 0)
8599 c = *fmt++;
8600 if (c == '*') {
8601 v = getnextarg(args, arglen, &argidx);
8602 if (v == NULL)
8603 goto onError;
8604 if (!PyInt_Check(v)) {
8605 PyErr_SetString(PyExc_TypeError,
8606 "* wants int");
8607 goto onError;
8608 }
8609 prec = PyInt_AsLong(v);
8610 if (prec < 0)
8611 prec = 0;
8612 if (--fmtcnt >= 0)
8613 c = *fmt++;
8614 }
8615 else if (c >= '0' && c <= '9') {
8616 prec = c - '0';
8617 while (--fmtcnt >= 0) {
Stefan Krahae7dd8f2010-07-19 18:24:18 +00008618 c = *fmt++;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008619 if (c < '0' || c > '9')
8620 break;
8621 if ((prec*10) / 10 != prec) {
8622 PyErr_SetString(PyExc_ValueError,
8623 "prec too big");
8624 goto onError;
8625 }
8626 prec = prec*10 + (c - '0');
8627 }
8628 }
8629 } /* prec */
8630 if (fmtcnt >= 0) {
8631 if (c == 'h' || c == 'l' || c == 'L') {
8632 if (--fmtcnt >= 0)
8633 c = *fmt++;
8634 }
8635 }
8636 if (fmtcnt < 0) {
8637 PyErr_SetString(PyExc_ValueError,
8638 "incomplete format");
8639 goto onError;
8640 }
8641 if (c != '%') {
8642 v = getnextarg(args, arglen, &argidx);
8643 if (v == NULL)
8644 goto onError;
8645 }
8646 sign = 0;
8647 fill = ' ';
8648 switch (c) {
8649
8650 case '%':
8651 pbuf = formatbuf;
8652 /* presume that buffer length is at least 1 */
8653 pbuf[0] = '%';
8654 len = 1;
8655 break;
8656
8657 case 's':
8658 case 'r':
Victor Stinner4fd2ff92010-03-22 12:56:39 +00008659 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008660 temp = v;
8661 Py_INCREF(temp);
8662 }
8663 else {
8664 PyObject *unicode;
8665 if (c == 's')
8666 temp = PyObject_Unicode(v);
8667 else
8668 temp = PyObject_Repr(v);
8669 if (temp == NULL)
8670 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008671 if (PyUnicode_Check(temp))
8672 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008673 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008674 /* convert to string to Unicode */
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008675 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8676 PyString_GET_SIZE(temp),
8677 NULL,
8678 "strict");
8679 Py_DECREF(temp);
8680 temp = unicode;
8681 if (temp == NULL)
8682 goto onError;
8683 }
8684 else {
8685 Py_DECREF(temp);
8686 PyErr_SetString(PyExc_TypeError,
8687 "%s argument has non-string str()");
8688 goto onError;
8689 }
8690 }
8691 pbuf = PyUnicode_AS_UNICODE(temp);
8692 len = PyUnicode_GET_SIZE(temp);
8693 if (prec >= 0 && len > prec)
8694 len = prec;
8695 break;
8696
8697 case 'i':
8698 case 'd':
8699 case 'u':
8700 case 'o':
8701 case 'x':
8702 case 'X':
8703 if (c == 'i')
8704 c = 'd';
8705 isnumok = 0;
8706 if (PyNumber_Check(v)) {
8707 PyObject *iobj=NULL;
8708
8709 if (PyInt_Check(v) || (PyLong_Check(v))) {
8710 iobj = v;
8711 Py_INCREF(iobj);
8712 }
8713 else {
8714 iobj = PyNumber_Int(v);
8715 if (iobj==NULL) iobj = PyNumber_Long(v);
8716 }
8717 if (iobj!=NULL) {
8718 if (PyInt_Check(iobj)) {
8719 isnumok = 1;
8720 pbuf = formatbuf;
8721 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8722 flags, prec, c, iobj);
8723 Py_DECREF(iobj);
8724 if (len < 0)
8725 goto onError;
8726 sign = 1;
8727 }
8728 else if (PyLong_Check(iobj)) {
8729 isnumok = 1;
8730 temp = formatlong(iobj, flags, prec, c);
8731 Py_DECREF(iobj);
8732 if (!temp)
8733 goto onError;
8734 pbuf = PyUnicode_AS_UNICODE(temp);
8735 len = PyUnicode_GET_SIZE(temp);
8736 sign = 1;
8737 }
8738 else {
8739 Py_DECREF(iobj);
8740 }
8741 }
8742 }
8743 if (!isnumok) {
8744 PyErr_Format(PyExc_TypeError,
8745 "%%%c format: a number is required, "
8746 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8747 goto onError;
8748 }
8749 if (flags & F_ZERO)
8750 fill = '0';
8751 break;
8752
8753 case 'e':
8754 case 'E':
8755 case 'f':
8756 case 'F':
8757 case 'g':
8758 case 'G':
8759 if (c == 'F')
8760 c = 'f';
8761 pbuf = formatbuf;
8762 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8763 flags, prec, c, v);
8764 if (len < 0)
8765 goto onError;
8766 sign = 1;
8767 if (flags & F_ZERO)
8768 fill = '0';
8769 break;
8770
8771 case 'c':
8772 pbuf = formatbuf;
8773 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8774 if (len < 0)
8775 goto onError;
8776 break;
8777
8778 default:
8779 PyErr_Format(PyExc_ValueError,
8780 "unsupported format character '%c' (0x%x) "
8781 "at index %zd",
8782 (31<=c && c<=126) ? (char)c : '?',
8783 (int)c,
8784 (Py_ssize_t)(fmt - 1 -
8785 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008786 goto onError;
8787 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008788 if (sign) {
8789 if (*pbuf == '-' || *pbuf == '+') {
8790 sign = *pbuf++;
8791 len--;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008792 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008793 else if (flags & F_SIGN)
8794 sign = '+';
8795 else if (flags & F_BLANK)
8796 sign = ' ';
8797 else
8798 sign = 0;
8799 }
8800 if (width < len)
8801 width = len;
8802 if (rescnt - (sign != 0) < width) {
8803 reslen -= rescnt;
8804 rescnt = width + fmtcnt + 100;
8805 reslen += rescnt;
8806 if (reslen < 0) {
8807 Py_XDECREF(temp);
8808 PyErr_NoMemory();
8809 goto onError;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008810 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008811 if (_PyUnicode_Resize(&result, reslen) < 0) {
8812 Py_XDECREF(temp);
8813 goto onError;
8814 }
8815 res = PyUnicode_AS_UNICODE(result)
8816 + reslen - rescnt;
8817 }
8818 if (sign) {
8819 if (fill != ' ')
8820 *res++ = sign;
8821 rescnt--;
8822 if (width > len)
8823 width--;
8824 }
8825 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8826 assert(pbuf[0] == '0');
8827 assert(pbuf[1] == c);
8828 if (fill != ' ') {
8829 *res++ = *pbuf++;
8830 *res++ = *pbuf++;
8831 }
8832 rescnt -= 2;
8833 width -= 2;
8834 if (width < 0)
8835 width = 0;
8836 len -= 2;
8837 }
8838 if (width > len && !(flags & F_LJUST)) {
8839 do {
8840 --rescnt;
8841 *res++ = fill;
8842 } while (--width > len);
8843 }
8844 if (fill == ' ') {
8845 if (sign)
8846 *res++ = sign;
8847 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8848 assert(pbuf[0] == '0');
8849 assert(pbuf[1] == c);
8850 *res++ = *pbuf++;
8851 *res++ = *pbuf++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008852 }
8853 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008854 Py_UNICODE_COPY(res, pbuf, len);
8855 res += len;
8856 rescnt -= len;
8857 while (--width >= len) {
8858 --rescnt;
8859 *res++ = ' ';
8860 }
8861 if (dict && (argidx < arglen) && c != '%') {
8862 PyErr_SetString(PyExc_TypeError,
8863 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008864 Py_XDECREF(temp);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008865 goto onError;
8866 }
8867 Py_XDECREF(temp);
8868 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 } /* until end */
8870 if (argidx < arglen && !dict) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008871 PyErr_SetString(PyExc_TypeError,
8872 "not all arguments converted during string formatting");
8873 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 }
8875
Thomas Woutersa96affe2006-03-12 00:29:36 +00008876 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008877 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878 if (args_owned) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008879 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880 }
8881 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 return (PyObject *)result;
8883
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008884 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 Py_XDECREF(result);
8886 Py_DECREF(uformat);
8887 if (args_owned) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008888 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889 }
8890 return NULL;
8891}
8892
8893static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008894 (readbufferproc) unicode_buffer_getreadbuf,
8895 (writebufferproc) unicode_buffer_getwritebuf,
8896 (segcountproc) unicode_buffer_getsegcount,
8897 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898};
8899
Jeremy Hylton938ace62002-07-17 16:30:39 +00008900static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008901unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8902
Tim Peters6d6c1a32001-08-02 04:15:00 +00008903static PyObject *
8904unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8905{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008906 PyObject *x = NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008907 static char *kwlist[] = {"string", "encoding", "errors", 0};
8908 char *encoding = NULL;
8909 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008910
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008911 if (type != &PyUnicode_Type)
8912 return unicode_subtype_new(type, args, kwds);
8913 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008914 kwlist, &x, &encoding, &errors))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008915 return NULL;
8916 if (x == NULL)
8917 return (PyObject *)_PyUnicode_New(0);
8918 if (encoding == NULL && errors == NULL)
8919 return PyObject_Unicode(x);
8920 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008921 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008922}
8923
Guido van Rossume023fe02001-08-30 03:12:59 +00008924static PyObject *
8925unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8926{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008927 PyUnicodeObject *tmp, *pnew;
8928 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008929
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008930 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8931 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8932 if (tmp == NULL)
8933 return NULL;
8934 assert(PyUnicode_Check(tmp));
8935 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8936 if (pnew == NULL) {
8937 Py_DECREF(tmp);
8938 return NULL;
8939 }
8940 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8941 if (pnew->str == NULL) {
8942 _Py_ForgetReference((PyObject *)pnew);
8943 PyObject_Del(pnew);
8944 Py_DECREF(tmp);
8945 return PyErr_NoMemory();
8946 }
8947 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8948 pnew->length = n;
8949 pnew->hash = tmp->hash;
8950 Py_DECREF(tmp);
8951 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008952}
8953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008954PyDoc_STRVAR(unicode_doc,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008955 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008956\n\
8957Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008958encoding defaults to the current default string encoding.\n\
8959errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008960
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008962 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008963 "unicode", /* tp_name */
8964 sizeof(PyUnicodeObject), /* tp_size */
8965 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 /* Slots */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008967 (destructor)unicode_dealloc, /* tp_dealloc */
8968 0, /* tp_print */
8969 0, /* tp_getattr */
8970 0, /* tp_setattr */
8971 0, /* tp_compare */
8972 unicode_repr, /* tp_repr */
8973 &unicode_as_number, /* tp_as_number */
8974 &unicode_as_sequence, /* tp_as_sequence */
8975 &unicode_as_mapping, /* tp_as_mapping */
8976 (hashfunc) unicode_hash, /* tp_hash*/
8977 0, /* tp_call*/
8978 (reprfunc) unicode_str, /* tp_str */
8979 PyObject_GenericGetAttr, /* tp_getattro */
8980 0, /* tp_setattro */
8981 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008982 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008983 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008984 unicode_doc, /* tp_doc */
8985 0, /* tp_traverse */
8986 0, /* tp_clear */
8987 PyUnicode_RichCompare, /* tp_richcompare */
8988 0, /* tp_weaklistoffset */
8989 0, /* tp_iter */
8990 0, /* tp_iternext */
8991 unicode_methods, /* tp_methods */
8992 0, /* tp_members */
8993 0, /* tp_getset */
8994 &PyBaseString_Type, /* tp_base */
8995 0, /* tp_dict */
8996 0, /* tp_descr_get */
8997 0, /* tp_descr_set */
8998 0, /* tp_dictoffset */
8999 0, /* tp_init */
9000 0, /* tp_alloc */
9001 unicode_new, /* tp_new */
9002 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003};
9004
9005/* Initialize the Unicode implementation */
9006
Thomas Wouters78890102000-07-22 19:25:51 +00009007void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009009 int i;
9010
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009011 /* XXX - move this array to unicodectype.c ? */
9012 Py_UNICODE linebreak[] = {
9013 0x000A, /* LINE FEED */
9014 0x000D, /* CARRIAGE RETURN */
9015 0x001C, /* FILE SEPARATOR */
9016 0x001D, /* GROUP SEPARATOR */
9017 0x001E, /* RECORD SEPARATOR */
9018 0x0085, /* NEXT LINE */
9019 0x2028, /* LINE SEPARATOR */
9020 0x2029, /* PARAGRAPH SEPARATOR */
9021 };
9022
Fred Drakee4315f52000-05-09 19:53:39 +00009023 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009024 free_list = NULL;
9025 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009027 if (!unicode_empty)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009028 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00009029
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009030 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009031 for (i = 0; i < 256; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009032 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009033 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009034 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009035
9036 /* initialize the linebreak bloom filter */
9037 bloom_linebreak = make_bloom_mask(
9038 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9039 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009040
9041 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042}
9043
9044/* Finalize the Unicode implementation */
9045
Christian Heimes3b718a72008-02-14 12:47:33 +00009046int
9047PyUnicode_ClearFreeList(void)
9048{
9049 int freelist_size = numfree;
9050 PyUnicodeObject *u;
9051
9052 for (u = free_list; u != NULL;) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009053 PyUnicodeObject *v = u;
9054 u = *(PyUnicodeObject **)u;
9055 if (v->str)
9056 PyObject_DEL(v->str);
9057 Py_XDECREF(v->defenc);
9058 PyObject_Del(v);
9059 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00009060 }
9061 free_list = NULL;
9062 assert(numfree == 0);
9063 return freelist_size;
9064}
9065
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066void
Thomas Wouters78890102000-07-22 19:25:51 +00009067_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009069 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009071 Py_XDECREF(unicode_empty);
9072 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009073
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009074 for (i = 0; i < 256; i++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009075 if (unicode_latin1[i]) {
9076 Py_DECREF(unicode_latin1[i]);
9077 unicode_latin1[i] = NULL;
9078 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009079 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009080 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009082
Anthony Baxterac6bd462006-04-13 02:06:09 +00009083#ifdef __cplusplus
9084}
9085#endif
9086
9087
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009088/*
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009089 Local variables:
9090 c-basic-offset: 4
9091 indent-tabs-mode: nil
9092 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009093*/