blob: 524859c1116d6ba531d366cf3ae7b5c421774c90 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "keep alive" optimization.

   Objects placed on the free list keep their character buffer
   allocated as long as their length is below this limit.  This saves
   malloc() calls for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and must
   not be used before that API has been called.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters: one entry
   per ASCII code point (0..127), nonzero when the code point is
   whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: HORIZONTAL TABULATION (0x09), LINE FEED (0x0A),
       VERTICAL TABULATION (0x0B), FORM FEED (0x0C),
       CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same idea for line breaks: one entry per ASCII code point (0..127),
   nonzero when the code point is a line break character. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: LINE FEED (0x0A), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177 return 0x10FFFF;
178#else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182#endif
183}
184
/* --- Bloom Filters ----------------------------------------------------- */

/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used, with the lowest 5
   bits of each Unicode character serving as the bit index. */

/* The linebreak mask is set up by Unicode_Init below. */
192
/* Integer type used to hold a bloom bitmask (only the low 32 bits are
   ever set, since bit indices are taken modulo 32). */
#define BLOOM_MASK unsigned long

/* Bloom mask for the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  A zero result means `ch` is definitely not in the set the
   mask was built from; a nonzero result means "maybe".

   The shifted constant is 1UL rather than 1: shifting a signed int
   into bit 31 is undefined behaviour and, where long is wider than
   int, the negative result would sign-extend and smear into the upper
   bits of the mask.  `mask` is parenthesized so that expressions such
   as `a | b` can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: ASCII code points are answered by the
   ascii_linebreak table; everything else goes through the bloom
   pre-filter before the full Py_UNICODE_ISLINEBREAK() check.
   Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
228#define BLOOM_MEMBER(mask, chr, set, setlen)\
229 BLOOM(mask, chr) && unicode_member(chr, set, setlen)
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
283/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000284 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
288
289*/
290
291static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293{
294 register PyUnicodeObject *unicode;
295
Andrew Dalkee0df7622006-05-27 11:04:36 +0000296 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
300 }
301
Neal Norwitze7d8be82008-07-31 17:17:14 +0000302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
305 }
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000315 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000316 unicode_resize(unicode, length) < 0) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000317 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000318 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 }
320 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000324 }
325 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000328 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 if (unicode == NULL)
331 return NULL;
Neal Norwitz419fd492008-03-17 20:22:43 +0000332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 }
335
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 if (!unicode->str) {
337 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000338 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000339 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
346 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000349 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000351 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000353
354 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000357 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000358 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360}
361
362static
Guido van Rossum9475a232001-10-05 20:51:39 +0000363void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000365 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000366 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 /* Keep-Alive optimization */
368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000369 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 unicode->str = NULL;
371 unicode->length = 0;
372 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000376 }
377 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000383 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000384 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 }
387}
388
Martin v. Löwis18e16552006-02-15 17:27:45 +0000389int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000390{
391 register PyUnicodeObject *v;
392
393 /* Argument checks */
394 if (unicode == NULL) {
395 PyErr_BadInternalCall();
396 return -1;
397 }
398 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000399 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000400 PyErr_BadInternalCall();
401 return -1;
402 }
403
404 /* Resizing unicode_empty and single character objects is not
405 possible since these are being shared. We simply return a fresh
406 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000407 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 (v == unicode_empty || v->length == 1)) {
409 PyUnicodeObject *w = _PyUnicode_New(length);
410 if (w == NULL)
411 return -1;
412 Py_UNICODE_COPY(w->str, v->str,
413 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000414 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415 *unicode = (PyObject *)w;
416 return 0;
417 }
418
419 /* Note that we don't have to modify *unicode for unshared Unicode
420 objects, since we can modify them in-place. */
421 return unicode_resize(v, length);
422}
423
424/* Internal API for use in unicodeobject.c only ! */
425#define _PyUnicode_Resize(unicodevar, length) \
426 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000429 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430{
431 PyUnicodeObject *unicode;
432
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433 /* If the Unicode data is known at construction time, we can apply
434 some optimizations which share commonly used objects. */
435 if (u != NULL) {
436
437 /* Optimization for empty strings */
438 if (size == 0 && unicode_empty != NULL) {
439 Py_INCREF(unicode_empty);
440 return (PyObject *)unicode_empty;
441 }
442
443 /* Single character Unicode objects in the Latin-1 range are
444 shared when using this constructor */
445 if (size == 1 && *u < 256) {
446 unicode = unicode_latin1[*u];
447 if (!unicode) {
448 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 if (!unicode)
450 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000451 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000452 unicode_latin1[*u] = unicode;
453 }
454 Py_INCREF(unicode);
455 return (PyObject *)unicode;
456 }
457 }
Tim Petersced69f82003-09-16 20:30:58 +0000458
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459 unicode = _PyUnicode_New(size);
460 if (!unicode)
461 return NULL;
462
463 /* Copy the Unicode data into the new object */
464 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000465 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000466
467 return (PyObject *)unicode;
468}
469
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000470PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
471{
472 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000473
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000474 if (size < 0) {
475 PyErr_SetString(PyExc_SystemError,
476 "Negative size passed to PyUnicode_FromStringAndSize");
477 return NULL;
478 }
479
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects.
482 Also, this means the input must be UTF-8, so fall back to the
483 UTF-8 decoder at the end. */
484 if (u != NULL) {
485
486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
490 }
491
492 /* Single characters are shared when using this constructor.
493 Restrict to ASCII, since the input must be UTF-8. */
494 if (size == 1 && Py_CHARMASK(*u) < 128) {
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000495 unicode = unicode_latin1[Py_CHARMASK(*u)];
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = Py_CHARMASK(*u);
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000501 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
506
507 return PyUnicode_DecodeUTF8(u, size, NULL);
508 }
509
510 unicode = _PyUnicode_New(size);
511 if (!unicode)
512 return NULL;
513
514 return (PyObject *)unicode;
515}
516
517PyObject *PyUnicode_FromString(const char *u)
518{
519 size_t size = strlen(u);
520 if (size > PY_SSIZE_T_MAX) {
521 PyErr_SetString(PyExc_OverflowError, "input too long");
522 return NULL;
523 }
524
525 return PyUnicode_FromStringAndSize(u, size);
526}
527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528#ifdef HAVE_WCHAR_H
529
530PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000531 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532{
533 PyUnicodeObject *unicode;
534
535 if (w == NULL) {
536 PyErr_BadInternalCall();
537 return NULL;
538 }
539
540 unicode = _PyUnicode_New(size);
541 if (!unicode)
542 return NULL;
543
544 /* Copy the wchar_t data into the new object */
545#ifdef HAVE_USABLE_WCHAR_T
546 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000547#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 {
549 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000550 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000551 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000552 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553 *u++ = *w++;
554 }
555#endif
556
557 return (PyObject *)unicode;
558}
559
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000560static void
561makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
562{
563 *fmt++ = '%';
564 if (width) {
565 if (zeropad)
566 *fmt++ = '0';
567 fmt += sprintf(fmt, "%d", width);
568 }
569 if (precision)
570 fmt += sprintf(fmt, ".%d", precision);
571 if (longflag)
572 *fmt++ = 'l';
573 else if (size_tflag) {
574 char *f = PY_FORMAT_SIZE_T;
575 while (*f)
576 *fmt++ = *f++;
577 }
578 *fmt++ = c;
579 *fmt = '\0';
580}
581
/* Append the NUL-terminated byte string `string` at the output cursor.
   Relies on two variables of the expanding scope: `copy` (a
   const char * scratch pointer) and `s` (the output cursor, advanced
   past the appended characters).

   Each byte is widened through unsigned char so that bytes >= 0x80
   become 0x80..0xFF instead of being sign-extended on platforms where
   plain char is signed.  The brace form (rather than do/while) is kept
   so that existing uses keep compiling unchanged. */
#define appendstring(string) \
    {for (copy = (string); *copy;) *s++ = (unsigned char)*copy++;}
583
584PyObject *
585PyUnicode_FromFormatV(const char *format, va_list vargs)
586{
587 va_list count;
588 Py_ssize_t callcount = 0;
589 PyObject **callresults = NULL;
590 PyObject **callresult = NULL;
591 Py_ssize_t n = 0;
592 int width = 0;
593 int precision = 0;
594 int zeropad;
595 const char* f;
596 Py_UNICODE *s;
597 PyObject *string;
598 /* used by sprintf */
599 char buffer[21];
600 /* use abuffer instead of buffer, if we need more space
601 * (which can happen if there's a format specifier with width). */
602 char *abuffer = NULL;
603 char *realbuffer;
604 Py_ssize_t abuffersize = 0;
605 char fmt[60]; /* should be enough for %0width.precisionld */
606 const char *copy;
607
608#ifdef VA_LIST_IS_ARRAY
609 Py_MEMCPY(count, vargs, sizeof(va_list));
610#else
611#ifdef __va_copy
612 __va_copy(count, vargs);
613#else
614 count = vargs;
615#endif
616#endif
617 /* step 1: count the number of %S/%R format specifications
618 * (we call PyObject_Str()/PyObject_Repr() for these objects
619 * once during step 3 and put the result in an array) */
620 for (f = format; *f; f++) {
621 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
622 ++callcount;
623 }
624 /* step 2: allocate memory for the results of
625 * PyObject_Str()/PyObject_Repr() calls */
626 if (callcount) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000627 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000628 if (!callresults) {
629 PyErr_NoMemory();
630 return NULL;
631 }
632 callresult = callresults;
633 }
634 /* step 3: figure out how large a buffer we need */
635 for (f = format; *f; f++) {
636 if (*f == '%') {
637 const char* p = f;
638 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000639 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000640 width = (width*10) + *f++ - '0';
Neal Norwitzade57d02008-03-23 06:19:57 +0000641 while (*++f && *f != '%' && !isalpha((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000642 ;
643
644 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
645 * they don't affect the amount of space we reserve.
646 */
647 if ((*f == 'l' || *f == 'z') &&
648 (f[1] == 'd' || f[1] == 'u'))
649 ++f;
650
651 switch (*f) {
652 case 'c':
653 (void)va_arg(count, int);
654 /* fall through... */
655 case '%':
656 n++;
657 break;
658 case 'd': case 'u': case 'i': case 'x':
659 (void) va_arg(count, int);
660 /* 20 bytes is enough to hold a 64-bit
661 integer. Decimal takes the most space.
662 This isn't enough for octal.
663 If a width is specified we need more
664 (which we allocate later). */
665 if (width < 20)
666 width = 20;
667 n += width;
668 if (abuffersize < width)
669 abuffersize = width;
670 break;
671 case 's':
672 {
673 /* UTF-8 */
674 unsigned char*s;
675 s = va_arg(count, unsigned char*);
676 while (*s) {
677 if (*s < 128) {
678 n++; s++;
679 } else if (*s < 0xc0) {
680 /* invalid UTF-8 */
681 n++; s++;
682 } else if (*s < 0xc0) {
683 n++;
684 s++; if(!*s)break;
685 s++;
686 } else if (*s < 0xe0) {
687 n++;
688 s++; if(!*s)break;
689 s++; if(!*s)break;
690 s++;
691 } else {
692 #ifdef Py_UNICODE_WIDE
693 n++;
694 #else
695 n+=2;
696 #endif
697 s++; if(!*s)break;
698 s++; if(!*s)break;
699 s++; if(!*s)break;
700 s++;
701 }
702 }
703 break;
704 }
705 case 'U':
706 {
707 PyObject *obj = va_arg(count, PyObject *);
708 assert(obj && PyUnicode_Check(obj));
709 n += PyUnicode_GET_SIZE(obj);
710 break;
711 }
712 case 'V':
713 {
714 PyObject *obj = va_arg(count, PyObject *);
715 const char *str = va_arg(count, const char *);
716 assert(obj || str);
717 assert(!obj || PyUnicode_Check(obj));
718 if (obj)
719 n += PyUnicode_GET_SIZE(obj);
720 else
721 n += strlen(str);
722 break;
723 }
724 case 'S':
725 {
726 PyObject *obj = va_arg(count, PyObject *);
727 PyObject *str;
728 assert(obj);
729 str = PyObject_Str(obj);
730 if (!str)
731 goto fail;
732 n += PyUnicode_GET_SIZE(str);
733 /* Remember the str and switch to the next slot */
734 *callresult++ = str;
735 break;
736 }
737 case 'R':
738 {
739 PyObject *obj = va_arg(count, PyObject *);
740 PyObject *repr;
741 assert(obj);
742 repr = PyObject_Repr(obj);
743 if (!repr)
744 goto fail;
745 n += PyUnicode_GET_SIZE(repr);
746 /* Remember the repr and switch to the next slot */
747 *callresult++ = repr;
748 break;
749 }
750 case 'p':
751 (void) va_arg(count, int);
752 /* maximum 64-bit pointer representation:
753 * 0xffffffffffffffff
754 * so 19 characters is enough.
755 * XXX I count 18 -- what's the extra for?
756 */
757 n += 19;
758 break;
759 default:
760 /* if we stumble upon an unknown
761 formatting code, copy the rest of
762 the format string to the output
763 string. (we cannot just skip the
764 code, since there's no way to know
765 what's in the argument list) */
766 n += strlen(p);
767 goto expand;
768 }
769 } else
770 n++;
771 }
772 expand:
773 if (abuffersize > 20) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000774 abuffer = PyObject_Malloc(abuffersize);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000775 if (!abuffer) {
776 PyErr_NoMemory();
777 goto fail;
778 }
779 realbuffer = abuffer;
780 }
781 else
782 realbuffer = buffer;
783 /* step 4: fill the buffer */
784 /* Since we've analyzed how much space we need for the worst case,
785 we don't have to resize the string.
786 There can be no errors beyond this point. */
787 string = PyUnicode_FromUnicode(NULL, n);
788 if (!string)
789 goto fail;
790
791 s = PyUnicode_AS_UNICODE(string);
792 callresult = callresults;
793
794 for (f = format; *f; f++) {
795 if (*f == '%') {
796 const char* p = f++;
797 int longflag = 0;
798 int size_tflag = 0;
799 zeropad = (*f == '0');
800 /* parse the width.precision part */
801 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000802 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000803 width = (width*10) + *f++ - '0';
804 precision = 0;
805 if (*f == '.') {
806 f++;
Neal Norwitzade57d02008-03-23 06:19:57 +0000807 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000808 precision = (precision*10) + *f++ - '0';
809 }
810 /* handle the long flag, but only for %ld and %lu.
811 others can be added when necessary. */
812 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
813 longflag = 1;
814 ++f;
815 }
816 /* handle the size_t flag. */
817 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
818 size_tflag = 1;
819 ++f;
820 }
821
822 switch (*f) {
823 case 'c':
824 *s++ = va_arg(vargs, int);
825 break;
826 case 'd':
827 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
828 if (longflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, long));
830 else if (size_tflag)
831 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
832 else
833 sprintf(realbuffer, fmt, va_arg(vargs, int));
834 appendstring(realbuffer);
835 break;
836 case 'u':
837 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
838 if (longflag)
839 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
840 else if (size_tflag)
841 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
842 else
843 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
844 appendstring(realbuffer);
845 break;
846 case 'i':
847 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
848 sprintf(realbuffer, fmt, va_arg(vargs, int));
849 appendstring(realbuffer);
850 break;
851 case 'x':
852 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
853 sprintf(realbuffer, fmt, va_arg(vargs, int));
854 appendstring(realbuffer);
855 break;
856 case 's':
857 {
858 /* Parameter must be UTF-8 encoded.
859 In case of encoding errors, use
860 the replacement character. */
861 PyObject *u;
862 p = va_arg(vargs, char*);
863 u = PyUnicode_DecodeUTF8(p, strlen(p),
864 "replace");
865 if (!u)
866 goto fail;
867 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
868 PyUnicode_GET_SIZE(u));
869 s += PyUnicode_GET_SIZE(u);
870 Py_DECREF(u);
871 break;
872 }
873 case 'U':
874 {
875 PyObject *obj = va_arg(vargs, PyObject *);
876 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
877 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
878 s += size;
879 break;
880 }
881 case 'V':
882 {
883 PyObject *obj = va_arg(vargs, PyObject *);
884 const char *str = va_arg(vargs, const char *);
885 if (obj) {
886 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
887 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
888 s += size;
889 } else {
890 appendstring(str);
891 }
892 break;
893 }
894 case 'S':
895 case 'R':
896 {
897 Py_UNICODE *ucopy;
898 Py_ssize_t usize;
899 Py_ssize_t upos;
900 /* unused, since we already have the result */
901 (void) va_arg(vargs, PyObject *);
902 ucopy = PyUnicode_AS_UNICODE(*callresult);
903 usize = PyUnicode_GET_SIZE(*callresult);
904 for (upos = 0; upos<usize;)
905 *s++ = ucopy[upos++];
906 /* We're done with the unicode()/repr() => forget it */
907 Py_DECREF(*callresult);
908 /* switch to next unicode()/repr() result */
909 ++callresult;
910 break;
911 }
912 case 'p':
913 sprintf(buffer, "%p", va_arg(vargs, void*));
914 /* %p is ill-defined: ensure leading 0x. */
915 if (buffer[1] == 'X')
916 buffer[1] = 'x';
917 else if (buffer[1] != 'x') {
918 memmove(buffer+2, buffer, strlen(buffer)+1);
919 buffer[0] = '0';
920 buffer[1] = 'x';
921 }
922 appendstring(buffer);
923 break;
924 case '%':
925 *s++ = '%';
926 break;
927 default:
928 appendstring(p);
929 goto end;
930 }
931 } else
932 *s++ = *f;
933 }
934
935 end:
936 if (callresults)
Neal Norwitz419fd492008-03-17 20:22:43 +0000937 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000938 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000939 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000940 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
941 return string;
942 fail:
943 if (callresults) {
944 PyObject **callresult2 = callresults;
945 while (callresult2 < callresult) {
946 Py_DECREF(*callresult2);
947 ++callresult2;
948 }
Neal Norwitz419fd492008-03-17 20:22:43 +0000949 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000950 }
951 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000952 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000953 return NULL;
954}
955
956#undef appendstring
957
958PyObject *
959PyUnicode_FromFormat(const char *format, ...)
960{
961 PyObject* ret;
962 va_list vargs;
963
964#ifdef HAVE_STDARG_PROTOTYPES
965 va_start(vargs, format);
966#else
967 va_start(vargs);
968#endif
969 ret = PyUnicode_FromFormatV(format, vargs);
970 va_end(vargs);
971 return ret;
972}
973
Martin v. Löwis18e16552006-02-15 17:27:45 +0000974Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
975 wchar_t *w,
976 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000977{
978 if (unicode == NULL) {
979 PyErr_BadInternalCall();
980 return -1;
981 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000982
983 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000984 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000985 size = PyUnicode_GET_SIZE(unicode) + 1;
986
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987#ifdef HAVE_USABLE_WCHAR_T
988 memcpy(w, unicode->str, size * sizeof(wchar_t));
989#else
990 {
991 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000992 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000993 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000994 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000995 *w++ = *u++;
996 }
997#endif
998
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000999 if (size > PyUnicode_GET_SIZE(unicode))
1000 return PyUnicode_GET_SIZE(unicode);
1001 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002 return size;
1003}
1004
1005#endif
1006
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001007PyObject *PyUnicode_FromOrdinal(int ordinal)
1008{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001009 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001010
1011#ifdef Py_UNICODE_WIDE
1012 if (ordinal < 0 || ordinal > 0x10ffff) {
1013 PyErr_SetString(PyExc_ValueError,
1014 "unichr() arg not in range(0x110000) "
1015 "(wide Python build)");
1016 return NULL;
1017 }
1018#else
1019 if (ordinal < 0 || ordinal > 0xffff) {
1020 PyErr_SetString(PyExc_ValueError,
1021 "unichr() arg not in range(0x10000) "
1022 "(narrow Python build)");
1023 return NULL;
1024 }
1025#endif
1026
Hye-Shik Chang40574832004-04-06 07:24:51 +00001027 s[0] = (Py_UNICODE)ordinal;
1028 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001029}
1030
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031PyObject *PyUnicode_FromObject(register PyObject *obj)
1032{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001033 /* XXX Perhaps we should make this API an alias of
1034 PyObject_Unicode() instead ?! */
1035 if (PyUnicode_CheckExact(obj)) {
1036 Py_INCREF(obj);
1037 return obj;
1038 }
1039 if (PyUnicode_Check(obj)) {
1040 /* For a Unicode subtype that's not a Unicode object,
1041 return a true Unicode object with the same data. */
1042 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1043 PyUnicode_GET_SIZE(obj));
1044 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001045 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1046}
1047
1048PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1049 const char *encoding,
1050 const char *errors)
1051{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001052 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001053 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001054 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001055
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 if (obj == NULL) {
1057 PyErr_BadInternalCall();
1058 return NULL;
1059 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001060
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001061#if 0
1062 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001063 that no encodings is given and then redirect to
1064 PyObject_Unicode() which then applies the additional logic for
1065 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001066
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001067 NOTE: This API should really only be used for object which
1068 represent *encoded* Unicode !
1069
1070 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001071 if (PyUnicode_Check(obj)) {
1072 if (encoding) {
1073 PyErr_SetString(PyExc_TypeError,
1074 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001075 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001076 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001077 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001078 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001079#else
1080 if (PyUnicode_Check(obj)) {
1081 PyErr_SetString(PyExc_TypeError,
1082 "decoding Unicode is not supported");
1083 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001084 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001085#endif
1086
1087 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001088 if (PyString_Check(obj)) {
1089 s = PyString_AS_STRING(obj);
1090 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001091 }
Christian Heimes3497f942008-05-26 12:29:14 +00001092 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001093 /* Python 2.x specific */
1094 PyErr_Format(PyExc_TypeError,
1095 "decoding bytearray is not supported");
1096 return NULL;
1097 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001098 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1099 /* Overwrite the error message with something more useful in
1100 case of a TypeError. */
1101 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001102 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001103 "coercing to Unicode: need string or buffer, "
1104 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001105 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001106 goto onError;
1107 }
Tim Petersced69f82003-09-16 20:30:58 +00001108
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (len == 0) {
1111 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001112 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 }
Tim Petersced69f82003-09-16 20:30:58 +00001114 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001116
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001117 return v;
1118
1119 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001120 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121}
1122
1123PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001124 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125 const char *encoding,
1126 const char *errors)
1127{
1128 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001129
1130 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001131 encoding = PyUnicode_GetDefaultEncoding();
1132
1133 /* Shortcuts for common default encodings */
1134 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001136 else if (strcmp(encoding, "latin-1") == 0)
1137 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001138#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1139 else if (strcmp(encoding, "mbcs") == 0)
1140 return PyUnicode_DecodeMBCS(s, size, errors);
1141#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001142 else if (strcmp(encoding, "ascii") == 0)
1143 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144
1145 /* Decode via the codec registry */
1146 buffer = PyBuffer_FromMemory((void *)s, size);
1147 if (buffer == NULL)
1148 goto onError;
1149 unicode = PyCodec_Decode(buffer, encoding, errors);
1150 if (unicode == NULL)
1151 goto onError;
1152 if (!PyUnicode_Check(unicode)) {
1153 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001154 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001155 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 Py_DECREF(unicode);
1157 goto onError;
1158 }
1159 Py_DECREF(buffer);
1160 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001161
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 onError:
1163 Py_XDECREF(buffer);
1164 return NULL;
1165}
1166
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001167PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1168 const char *encoding,
1169 const char *errors)
1170{
1171 PyObject *v;
1172
1173 if (!PyUnicode_Check(unicode)) {
1174 PyErr_BadArgument();
1175 goto onError;
1176 }
1177
1178 if (encoding == NULL)
1179 encoding = PyUnicode_GetDefaultEncoding();
1180
1181 /* Decode via the codec registry */
1182 v = PyCodec_Decode(unicode, encoding, errors);
1183 if (v == NULL)
1184 goto onError;
1185 return v;
1186
1187 onError:
1188 return NULL;
1189}
1190
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001192 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 const char *encoding,
1194 const char *errors)
1195{
1196 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001197
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198 unicode = PyUnicode_FromUnicode(s, size);
1199 if (unicode == NULL)
1200 return NULL;
1201 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1202 Py_DECREF(unicode);
1203 return v;
1204}
1205
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001206PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1207 const char *encoding,
1208 const char *errors)
1209{
1210 PyObject *v;
1211
1212 if (!PyUnicode_Check(unicode)) {
1213 PyErr_BadArgument();
1214 goto onError;
1215 }
1216
1217 if (encoding == NULL)
1218 encoding = PyUnicode_GetDefaultEncoding();
1219
1220 /* Encode via the codec registry */
1221 v = PyCodec_Encode(unicode, encoding, errors);
1222 if (v == NULL)
1223 goto onError;
1224 return v;
1225
1226 onError:
1227 return NULL;
1228}
1229
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1231 const char *encoding,
1232 const char *errors)
1233{
1234 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001235
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 if (!PyUnicode_Check(unicode)) {
1237 PyErr_BadArgument();
1238 goto onError;
1239 }
Fred Drakee4315f52000-05-09 19:53:39 +00001240
Tim Petersced69f82003-09-16 20:30:58 +00001241 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001242 encoding = PyUnicode_GetDefaultEncoding();
1243
1244 /* Shortcuts for common default encodings */
1245 if (errors == NULL) {
1246 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001247 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001248 else if (strcmp(encoding, "latin-1") == 0)
1249 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001250#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1251 else if (strcmp(encoding, "mbcs") == 0)
1252 return PyUnicode_AsMBCSString(unicode);
1253#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001254 else if (strcmp(encoding, "ascii") == 0)
1255 return PyUnicode_AsASCIIString(unicode);
1256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001257
1258 /* Encode via the codec registry */
1259 v = PyCodec_Encode(unicode, encoding, errors);
1260 if (v == NULL)
1261 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001262 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001264 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001265 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266 Py_DECREF(v);
1267 goto onError;
1268 }
1269 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001270
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 onError:
1272 return NULL;
1273}
1274
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001275PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1276 const char *errors)
1277{
1278 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1279
1280 if (v)
1281 return v;
1282 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1283 if (v && errors == NULL)
1284 ((PyUnicodeObject *)unicode)->defenc = v;
1285 return v;
1286}
1287
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1289{
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1293 }
1294 return PyUnicode_AS_UNICODE(unicode);
1295
1296 onError:
1297 return NULL;
1298}
1299
Martin v. Löwis18e16552006-02-15 17:27:45 +00001300Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301{
1302 if (!PyUnicode_Check(unicode)) {
1303 PyErr_BadArgument();
1304 goto onError;
1305 }
1306 return PyUnicode_GET_SIZE(unicode);
1307
1308 onError:
1309 return -1;
1310}
1311
Thomas Wouters78890102000-07-22 19:25:51 +00001312const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001313{
1314 return unicode_default_encoding;
1315}
1316
1317int PyUnicode_SetDefaultEncoding(const char *encoding)
1318{
1319 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001320
Fred Drakee4315f52000-05-09 19:53:39 +00001321 /* Make sure the encoding is valid. As side effect, this also
1322 loads the encoding into the codec registry cache. */
1323 v = _PyCodec_Lookup(encoding);
1324 if (v == NULL)
1325 goto onError;
1326 Py_DECREF(v);
1327 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001328 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001329 sizeof(unicode_default_encoding));
1330 return 0;
1331
1332 onError:
1333 return -1;
1334}
1335
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001336/* error handling callback helper:
1337 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001338 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 and adjust various state variables.
1340 return 0 on success, -1 on error
1341*/
1342
1343static
1344int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1345 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001346 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1347 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001348 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001350 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001351
1352 PyObject *restuple = NULL;
1353 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001354 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1355 Py_ssize_t requiredsize;
1356 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001357 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001358 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001359 int res = -1;
1360
1361 if (*errorHandler == NULL) {
1362 *errorHandler = PyCodec_LookupError(errors);
1363 if (*errorHandler == NULL)
1364 goto onError;
1365 }
1366
1367 if (*exceptionObject == NULL) {
1368 *exceptionObject = PyUnicodeDecodeError_Create(
1369 encoding, input, insize, *startinpos, *endinpos, reason);
1370 if (*exceptionObject == NULL)
1371 goto onError;
1372 }
1373 else {
1374 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1375 goto onError;
1376 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1377 goto onError;
1378 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1379 goto onError;
1380 }
1381
1382 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1383 if (restuple == NULL)
1384 goto onError;
1385 if (!PyTuple_Check(restuple)) {
1386 PyErr_Format(PyExc_TypeError, &argparse[4]);
1387 goto onError;
1388 }
1389 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1390 goto onError;
1391 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001392 newpos = insize+newpos;
1393 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001394 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001395 goto onError;
1396 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397
1398 /* need more space? (at least enough for what we
1399 have+the replacement+the rest of the string (starting
1400 at the new input position), so we won't have to check space
1401 when there are no errors in the rest of the string) */
1402 repptr = PyUnicode_AS_UNICODE(repunicode);
1403 repsize = PyUnicode_GET_SIZE(repunicode);
1404 requiredsize = *outpos + repsize + insize-newpos;
1405 if (requiredsize > outsize) {
1406 if (requiredsize<2*outsize)
1407 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001408 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001409 goto onError;
1410 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1411 }
1412 *endinpos = newpos;
1413 *inptr = input + newpos;
1414 Py_UNICODE_COPY(*outptr, repptr, repsize);
1415 *outptr += repsize;
1416 *outpos += repsize;
1417 /* we made it! */
1418 res = 0;
1419
1420 onError:
1421 Py_XDECREF(restuple);
1422 return res;
1423}
1424
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001425/* --- UTF-7 Codec -------------------------------------------------------- */
1426
1427/* see RFC2152 for details */
1428
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Each entry tells whether the character can be written directly:
       0 - not special
       1 - special (cannot be directly encoded)
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f (TAB, LF and CR are whitespace) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f:  space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f:  0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f:  @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f:  P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f:  ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f:  p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* SPECIAL(chr, setO, ws) is true when `chr` may not be emitted directly:
   it lies outside 1..127, is classified 1 in the table, is whitespace
   while `ws` is set, or belongs to Set O while `setO` is set.

   Note: the comparison (chr) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, that comparison can safely be made true.
   The range tests come first so the table is never indexed out of
   bounds. */

#define SPECIAL(chr, setO, ws)                              \
    ((chr) > 127 || (chr) <= 0 ||                           \
     utf7_special[(chr)] == 1 ||                            \
     ((ws) && (utf7_special[(chr)] == 2)) ||                \
     ((setO) && (utf7_special[(chr)] == 3)))
1457
/* B64(n): the base64 digit for the low six bits of `n`. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when `c` is one of the 64 base64 digits.
   The test is spelled out with explicit ASCII ranges instead of
   isalnum(): isalnum() is locale dependent (bytes above 127 may be
   reported as alphanumeric) and is undefined for values that do not
   fit in an unsigned char. */
#define B64CHAR(c)                      \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* UB64(c): the six-bit value of the base64 digit `c`.  Only meaningful
   when B64CHAR(c) is true. */
#define UB64(c)                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in the
   accumulator `ch`, emit the topmost six as one base64 digit through
   `out` and decrease `bits` accordingly.  Fewer than six bits remain
   pending afterwards. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }
1471
/* DECODE(dst, acc, nbits, pending): while at least 16 bits are
   available in the accumulator `acc`, take the topmost 16 as one
   Py_UNICODE unit and decrease `nbits` accordingly.

   - if `pending` is set, the unit is dropped and `pending` is cleared;
   - a unit in the range 0xDC00..0xDFFF sets `pending`, stores a message
     in `errmsg` and jumps to the `utf7Error` label;
   - any other unit is written through `dst`.

   The expansion site must therefore provide a variable `errmsg` and a
   label `utf7Error`. */
#define DECODE(dst, acc, nbits, pending)                                    \
    while (nbits >= 16) {                                                   \
        Py_UNICODE outCh = (Py_UNICODE) ((acc >> (nbits-16)) & 0xffff);     \
        nbits -= 16;                                                        \
        if (pending) {                                                      \
            /* An error was already reported for the preceding unit,        \
               so this one is skipped without further checks */             \
            pending = 0;                                                    \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
            /* Surrogate code unit: it cannot be represented in a           \
               16-bit character */                                          \
            pending = 1;                                                    \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *dst++ = outCh;                                                 \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001490
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001491PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001492 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001493 const char *errors)
1494{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001495 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1496}
1497
1498PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1499 Py_ssize_t size,
1500 const char *errors,
1501 Py_ssize_t *consumed)
1502{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001504 Py_ssize_t startinpos;
1505 Py_ssize_t endinpos;
1506 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001507 const char *e;
1508 PyUnicodeObject *unicode;
1509 Py_UNICODE *p;
1510 const char *errmsg = "";
1511 int inShift = 0;
1512 unsigned int bitsleft = 0;
1513 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001514 int surrogate = 0;
1515 PyObject *errorHandler = NULL;
1516 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001517
1518 unicode = _PyUnicode_New(size);
1519 if (!unicode)
1520 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001521 if (size == 0) {
1522 if (consumed)
1523 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001524 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001525 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526
1527 p = unicode->str;
1528 e = s + size;
1529
1530 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001531 Py_UNICODE ch;
1532 restart:
Antoine Pitrou4982d5d2008-07-25 17:45:59 +00001533 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534
1535 if (inShift) {
1536 if ((ch == '-') || !B64CHAR(ch)) {
1537 inShift = 0;
1538 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001539
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001540 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1541 if (bitsleft >= 6) {
1542 /* The shift sequence has a partial character in it. If
1543 bitsleft < 6 then we could just classify it as padding
1544 but that is not the case here */
1545
1546 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001547 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548 }
1549 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001550 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551 here so indicate the potential of a misencoded character. */
1552
1553 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1554 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1555 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001556 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557 }
1558
1559 if (ch == '-') {
1560 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001561 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001562 inShift = 1;
1563 }
1564 } else if (SPECIAL(ch,0,0)) {
1565 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001566 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001567 } else {
1568 *p++ = ch;
1569 }
1570 } else {
1571 charsleft = (charsleft << 6) | UB64(ch);
1572 bitsleft += 6;
1573 s++;
1574 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1575 }
1576 }
1577 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001578 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 s++;
1580 if (s < e && *s == '-') {
1581 s++;
1582 *p++ = '+';
1583 } else
1584 {
1585 inShift = 1;
1586 bitsleft = 0;
1587 }
1588 }
1589 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001590 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001591 errmsg = "unexpected special character";
1592 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001593 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594 }
1595 else {
1596 *p++ = ch;
1597 s++;
1598 }
1599 continue;
1600 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001601 outpos = p-PyUnicode_AS_UNICODE(unicode);
1602 endinpos = s-starts;
1603 if (unicode_decode_call_errorhandler(
1604 errors, &errorHandler,
1605 "utf7", errmsg,
1606 starts, size, &startinpos, &endinpos, &exc, &s,
1607 (PyObject **)&unicode, &outpos, &p))
1608 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001609 }
1610
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001611 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 outpos = p-PyUnicode_AS_UNICODE(unicode);
1613 endinpos = size;
1614 if (unicode_decode_call_errorhandler(
1615 errors, &errorHandler,
1616 "utf7", "unterminated shift sequence",
1617 starts, size, &startinpos, &endinpos, &exc, &s,
1618 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001619 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 if (s < e)
1621 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001623 if (consumed) {
1624 if(inShift)
1625 *consumed = startinpos;
1626 else
1627 *consumed = s-starts;
1628 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001629
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001630 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001631 goto onError;
1632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 Py_XDECREF(errorHandler);
1634 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 return (PyObject *)unicode;
1636
1637onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001638 Py_XDECREF(errorHandler);
1639 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001640 Py_DECREF(unicode);
1641 return NULL;
1642}
1643
1644
1645PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001646 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 int encodeSetO,
1648 int encodeWhiteSpace,
1649 const char *errors)
1650{
1651 PyObject *v;
1652 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001653 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001655 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001656 unsigned int bitsleft = 0;
1657 unsigned long charsleft = 0;
1658 char * out;
1659 char * start;
1660
Neal Norwitze7d8be82008-07-31 17:17:14 +00001661 if (cbAllocated / 5 != size)
1662 return PyErr_NoMemory();
1663
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 if (size == 0)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001665 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001667 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668 if (v == NULL)
1669 return NULL;
1670
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001671 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001672 for (;i < size; ++i) {
1673 Py_UNICODE ch = s[i];
1674
1675 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001676 if (ch == '+') {
1677 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001678 *out++ = '-';
1679 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1680 charsleft = ch;
1681 bitsleft = 16;
1682 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001683 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001684 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001685 } else {
1686 *out++ = (char) ch;
1687 }
1688 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1690 *out++ = B64(charsleft << (6-bitsleft));
1691 charsleft = 0;
1692 bitsleft = 0;
1693 /* Characters not in the BASE64 set implicitly unshift the sequence
1694 so no '-' is required, except if the character is itself a '-' */
1695 if (B64CHAR(ch) || ch == '-') {
1696 *out++ = '-';
1697 }
1698 inShift = 0;
1699 *out++ = (char) ch;
1700 } else {
1701 bitsleft += 16;
1702 charsleft = (charsleft << 16) | ch;
1703 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1704
1705 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001706 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001707 or '-' then the shift sequence will be terminated implicitly and we
1708 don't have to insert a '-'. */
1709
1710 if (bitsleft == 0) {
1711 if (i + 1 < size) {
1712 Py_UNICODE ch2 = s[i+1];
1713
1714 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001715
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716 } else if (B64CHAR(ch2) || ch2 == '-') {
1717 *out++ = '-';
1718 inShift = 0;
1719 } else {
1720 inShift = 0;
1721 }
1722
1723 }
1724 else {
1725 *out++ = '-';
1726 inShift = 0;
1727 }
1728 }
Tim Petersced69f82003-09-16 20:30:58 +00001729 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 if (bitsleft) {
1733 *out++= B64(charsleft << (6-bitsleft) );
1734 *out++ = '-';
1735 }
1736
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001737 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738 return v;
1739}
1740
1741#undef SPECIAL
1742#undef B64
1743#undef B64CHAR
1744#undef UB64
1745#undef ENCODE
1746#undef DECODE
1747
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748/* --- UTF-8 Codec -------------------------------------------------------- */
1749
/* Sequence length implied by each possible UTF-8 lead byte, indexed by the
   byte value.  Zero marks a byte that is illegal as a prefix.  See
   RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1771
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001773 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 const char *errors)
1775{
Walter Dörwald69652032004-09-07 20:24:22 +00001776 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1777}
1778
1779PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001780 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001781 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001782 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001783{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001784 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001786 Py_ssize_t startinpos;
1787 Py_ssize_t endinpos;
1788 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 const char *e;
1790 PyUnicodeObject *unicode;
1791 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001792 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001793 PyObject *errorHandler = NULL;
1794 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795
1796 /* Note: size will always be longer than the resulting Unicode
1797 character count */
1798 unicode = _PyUnicode_New(size);
1799 if (!unicode)
1800 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001801 if (size == 0) {
1802 if (consumed)
1803 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Unpack UTF-8 encoded data */
1808 p = unicode->str;
1809 e = s + size;
1810
1811 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001812 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813
1814 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001815 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 s++;
1817 continue;
1818 }
1819
1820 n = utf8_code_length[ch];
1821
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001822 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001823 if (consumed)
1824 break;
1825 else {
1826 errmsg = "unexpected end of data";
1827 startinpos = s-starts;
1828 endinpos = size;
1829 goto utf8Error;
1830 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001832
1833 switch (n) {
1834
1835 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001836 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 startinpos = s-starts;
1838 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001839 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840
1841 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001842 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 startinpos = s-starts;
1844 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001845 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846
1847 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001848 if ((s[1] & 0xc0) != 0x80) {
1849 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001850 startinpos = s-starts;
1851 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001852 goto utf8Error;
1853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001855 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001856 startinpos = s-starts;
1857 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001858 errmsg = "illegal encoding";
1859 goto utf8Error;
1860 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001862 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 break;
1864
1865 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001866 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001867 (s[2] & 0xc0) != 0x80) {
1868 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 startinpos = s-starts;
1870 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001871 goto utf8Error;
1872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001874 if (ch < 0x0800) {
1875 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001876 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001877
1878 XXX For wide builds (UCS-4) we should probably try
1879 to recombine the surrogates into a single code
1880 unit.
1881 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001882 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 startinpos = s-starts;
1884 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001885 goto utf8Error;
1886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001888 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001889 break;
1890
1891 case 4:
1892 if ((s[1] & 0xc0) != 0x80 ||
1893 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001894 (s[3] & 0xc0) != 0x80) {
1895 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001896 startinpos = s-starts;
1897 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001898 goto utf8Error;
1899 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001900 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1901 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1902 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001903 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001904 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001905 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001906 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001908 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001909 startinpos = s-starts;
1910 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001911 goto utf8Error;
1912 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001913#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001914 *p++ = (Py_UNICODE)ch;
1915#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001916 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001917
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001918 /* translate from 10000..10FFFF to 0..FFFF */
1919 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001920
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001921 /* high surrogate = top 10 bits added to D800 */
1922 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001923
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001924 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001925 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001926#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927 break;
1928
1929 default:
1930 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001931 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 startinpos = s-starts;
1933 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001934 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 }
1936 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001938
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001939 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 outpos = p-PyUnicode_AS_UNICODE(unicode);
1941 if (unicode_decode_call_errorhandler(
1942 errors, &errorHandler,
1943 "utf8", errmsg,
1944 starts, size, &startinpos, &endinpos, &exc, &s,
1945 (PyObject **)&unicode, &outpos, &p))
1946 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 }
Walter Dörwald69652032004-09-07 20:24:22 +00001948 if (consumed)
1949 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950
1951 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001952 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 goto onError;
1954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001955 Py_XDECREF(errorHandler);
1956 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 return (PyObject *)unicode;
1958
1959onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001960 Py_XDECREF(errorHandler);
1961 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 Py_DECREF(unicode);
1963 return NULL;
1964}
1965
Tim Peters602f7402002-04-27 18:03:26 +00001966/* Allocation strategy: if the string is short, convert into a stack buffer
1967 and allocate exactly as much space needed at the end. Else allocate the
1968 maximum possible needed (4 result bytes per Unicode character), and return
1969 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001970*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001971PyObject *
1972PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001973 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001974 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975{
Tim Peters602f7402002-04-27 18:03:26 +00001976#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001977
Martin v. Löwis18e16552006-02-15 17:27:45 +00001978 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001979 PyObject *v; /* result string object */
1980 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001981 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001982 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001983 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001984
Tim Peters602f7402002-04-27 18:03:26 +00001985 assert(s != NULL);
1986 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987
Tim Peters602f7402002-04-27 18:03:26 +00001988 if (size <= MAX_SHORT_UNICHARS) {
1989 /* Write into the stack buffer; nallocated can't overflow.
1990 * At the end, we'll allocate exactly as much heap space as it
1991 * turns out we need.
1992 */
1993 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1994 v = NULL; /* will allocate after we're done */
1995 p = stackbuf;
1996 }
1997 else {
1998 /* Overallocate on the heap, and give the excess back at the end. */
1999 nallocated = size * 4;
2000 if (nallocated / 4 != size) /* overflow! */
2001 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002002 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002003 if (v == NULL)
2004 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002005 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002006 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002007
Tim Peters602f7402002-04-27 18:03:26 +00002008 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002009 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002010
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002011 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002012 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002014
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002016 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002017 *p++ = (char)(0xc0 | (ch >> 6));
2018 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002019 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002020 else {
Tim Peters602f7402002-04-27 18:03:26 +00002021 /* Encode UCS2 Unicode ordinals */
2022 if (ch < 0x10000) {
2023 /* Special case: check for high surrogate */
2024 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2025 Py_UCS4 ch2 = s[i];
2026 /* Check for low surrogate and combine the two to
2027 form a UCS4 value */
2028 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002029 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002030 i++;
2031 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 }
Tim Peters602f7402002-04-27 18:03:26 +00002033 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002034 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002036 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2037 *p++ = (char)(0x80 | (ch & 0x3f));
2038 continue;
2039 }
2040encodeUCS4:
2041 /* Encode UCS4 Unicode ordinals */
2042 *p++ = (char)(0xf0 | (ch >> 18));
2043 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2044 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2045 *p++ = (char)(0x80 | (ch & 0x3f));
2046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002048
Tim Peters602f7402002-04-27 18:03:26 +00002049 if (v == NULL) {
2050 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002051 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002052 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002053 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002054 }
2055 else {
2056 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002057 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002058 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002059 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002060 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002062
Tim Peters602f7402002-04-27 18:03:26 +00002063#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064}
2065
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2067{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 if (!PyUnicode_Check(unicode)) {
2069 PyErr_BadArgument();
2070 return NULL;
2071 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002072 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2073 PyUnicode_GET_SIZE(unicode),
2074 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075}
2076
Walter Dörwald6e390802007-08-17 16:41:28 +00002077/* --- UTF-32 Codec ------------------------------------------------------- */
2078
2079PyObject *
2080PyUnicode_DecodeUTF32(const char *s,
2081 Py_ssize_t size,
2082 const char *errors,
2083 int *byteorder)
2084{
2085 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2086}
2087
2088PyObject *
2089PyUnicode_DecodeUTF32Stateful(const char *s,
2090 Py_ssize_t size,
2091 const char *errors,
2092 int *byteorder,
2093 Py_ssize_t *consumed)
2094{
2095 const char *starts = s;
2096 Py_ssize_t startinpos;
2097 Py_ssize_t endinpos;
2098 Py_ssize_t outpos;
2099 PyUnicodeObject *unicode;
2100 Py_UNICODE *p;
2101#ifndef Py_UNICODE_WIDE
2102 int i, pairs;
2103#else
2104 const int pairs = 0;
2105#endif
2106 const unsigned char *q, *e;
2107 int bo = 0; /* assume native ordering by default */
2108 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002109 /* Offsets from q for retrieving bytes in the right order. */
2110#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2111 int iorder[] = {0, 1, 2, 3};
2112#else
2113 int iorder[] = {3, 2, 1, 0};
2114#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002115 PyObject *errorHandler = NULL;
2116 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002117 /* On narrow builds we split characters outside the BMP into two
2118 codepoints => count how much extra space we need. */
2119#ifndef Py_UNICODE_WIDE
2120 for (i = pairs = 0; i < size/4; i++)
2121 if (((Py_UCS4 *)s)[i] >= 0x10000)
2122 pairs++;
2123#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002124
2125 /* This might be one to much, because of a BOM */
2126 unicode = _PyUnicode_New((size+3)/4+pairs);
2127 if (!unicode)
2128 return NULL;
2129 if (size == 0)
2130 return (PyObject *)unicode;
2131
2132 /* Unpack UTF-32 encoded data */
2133 p = unicode->str;
2134 q = (unsigned char *)s;
2135 e = q + size;
2136
2137 if (byteorder)
2138 bo = *byteorder;
2139
2140 /* Check for BOM marks (U+FEFF) in the input and adjust current
2141 byte order setting accordingly. In native mode, the leading BOM
2142 mark is skipped, in all other modes, it is copied to the output
2143 stream as-is (giving a ZWNBSP character). */
2144 if (bo == 0) {
2145 if (size >= 4) {
2146 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2147 (q[iorder[1]] << 8) | q[iorder[0]];
2148#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2149 if (bom == 0x0000FEFF) {
2150 q += 4;
2151 bo = -1;
2152 }
2153 else if (bom == 0xFFFE0000) {
2154 q += 4;
2155 bo = 1;
2156 }
2157#else
2158 if (bom == 0x0000FEFF) {
2159 q += 4;
2160 bo = 1;
2161 }
2162 else if (bom == 0xFFFE0000) {
2163 q += 4;
2164 bo = -1;
2165 }
2166#endif
2167 }
2168 }
2169
2170 if (bo == -1) {
2171 /* force LE */
2172 iorder[0] = 0;
2173 iorder[1] = 1;
2174 iorder[2] = 2;
2175 iorder[3] = 3;
2176 }
2177 else if (bo == 1) {
2178 /* force BE */
2179 iorder[0] = 3;
2180 iorder[1] = 2;
2181 iorder[2] = 1;
2182 iorder[3] = 0;
2183 }
2184
2185 while (q < e) {
2186 Py_UCS4 ch;
2187 /* remaining bytes at the end? (size should be divisible by 4) */
2188 if (e-q<4) {
2189 if (consumed)
2190 break;
2191 errmsg = "truncated data";
2192 startinpos = ((const char *)q)-starts;
2193 endinpos = ((const char *)e)-starts;
2194 goto utf32Error;
2195 /* The remaining input chars are ignored if the callback
2196 chooses to skip the input */
2197 }
2198 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2199 (q[iorder[1]] << 8) | q[iorder[0]];
2200
2201 if (ch >= 0x110000)
2202 {
2203 errmsg = "codepoint not in range(0x110000)";
2204 startinpos = ((const char *)q)-starts;
2205 endinpos = startinpos+4;
2206 goto utf32Error;
2207 }
2208#ifndef Py_UNICODE_WIDE
2209 if (ch >= 0x10000)
2210 {
2211 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2212 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2213 }
2214 else
2215#endif
2216 *p++ = ch;
2217 q += 4;
2218 continue;
2219 utf32Error:
2220 outpos = p-PyUnicode_AS_UNICODE(unicode);
2221 if (unicode_decode_call_errorhandler(
2222 errors, &errorHandler,
2223 "utf32", errmsg,
2224 starts, size, &startinpos, &endinpos, &exc, &s,
2225 (PyObject **)&unicode, &outpos, &p))
2226 goto onError;
2227 }
2228
2229 if (byteorder)
2230 *byteorder = bo;
2231
2232 if (consumed)
2233 *consumed = (const char *)q-starts;
2234
2235 /* Adjust length */
2236 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2237 goto onError;
2238
2239 Py_XDECREF(errorHandler);
2240 Py_XDECREF(exc);
2241 return (PyObject *)unicode;
2242
2243onError:
2244 Py_DECREF(unicode);
2245 Py_XDECREF(errorHandler);
2246 Py_XDECREF(exc);
2247 return NULL;
2248}
2249
2250PyObject *
2251PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2252 Py_ssize_t size,
2253 const char *errors,
2254 int byteorder)
2255{
2256 PyObject *v;
2257 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002258 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002259#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002260 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002261#else
2262 const int pairs = 0;
2263#endif
2264 /* Offsets from p for storing byte pairs in the right order. */
2265#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2266 int iorder[] = {0, 1, 2, 3};
2267#else
2268 int iorder[] = {3, 2, 1, 0};
2269#endif
2270
2271#define STORECHAR(CH) \
2272 do { \
2273 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2274 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2275 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2276 p[iorder[0]] = (CH) & 0xff; \
2277 p += 4; \
2278 } while(0)
2279
2280 /* In narrow builds we can output surrogate pairs as one codepoint,
2281 so we need less space. */
2282#ifndef Py_UNICODE_WIDE
2283 for (i = pairs = 0; i < size-1; i++)
2284 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2285 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2286 pairs++;
2287#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002288 nsize = (size - pairs + (byteorder == 0));
2289 bytesize = nsize * 4;
2290 if (bytesize / 4 != nsize)
2291 return PyErr_NoMemory();
2292 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002293 if (v == NULL)
2294 return NULL;
2295
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002296 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002297 if (byteorder == 0)
2298 STORECHAR(0xFEFF);
2299 if (size == 0)
2300 return v;
2301
2302 if (byteorder == -1) {
2303 /* force LE */
2304 iorder[0] = 0;
2305 iorder[1] = 1;
2306 iorder[2] = 2;
2307 iorder[3] = 3;
2308 }
2309 else if (byteorder == 1) {
2310 /* force BE */
2311 iorder[0] = 3;
2312 iorder[1] = 2;
2313 iorder[2] = 1;
2314 iorder[3] = 0;
2315 }
2316
2317 while (size-- > 0) {
2318 Py_UCS4 ch = *s++;
2319#ifndef Py_UNICODE_WIDE
2320 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2321 Py_UCS4 ch2 = *s;
2322 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2323 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2324 s++;
2325 size--;
2326 }
2327 }
2328#endif
2329 STORECHAR(ch);
2330 }
2331 return v;
2332#undef STORECHAR
2333}
2334
2335PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2336{
2337 if (!PyUnicode_Check(unicode)) {
2338 PyErr_BadArgument();
2339 return NULL;
2340 }
2341 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2342 PyUnicode_GET_SIZE(unicode),
2343 NULL,
2344 0);
2345}
2346
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347/* --- UTF-16 Codec ------------------------------------------------------- */
2348
Tim Peters772747b2001-08-09 22:21:55 +00002349PyObject *
2350PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002351 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002352 const char *errors,
2353 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354{
Walter Dörwald69652032004-09-07 20:24:22 +00002355 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2356}
2357
2358PyObject *
2359PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002360 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002361 const char *errors,
2362 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002363 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002364{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002365 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002366 Py_ssize_t startinpos;
2367 Py_ssize_t endinpos;
2368 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369 PyUnicodeObject *unicode;
2370 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002371 const unsigned char *q, *e;
2372 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002373 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002374 /* Offsets from q for retrieving byte pairs in the right order. */
2375#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2376 int ihi = 1, ilo = 0;
2377#else
2378 int ihi = 0, ilo = 1;
2379#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002380 PyObject *errorHandler = NULL;
2381 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002382
2383 /* Note: size will always be longer than the resulting Unicode
2384 character count */
2385 unicode = _PyUnicode_New(size);
2386 if (!unicode)
2387 return NULL;
2388 if (size == 0)
2389 return (PyObject *)unicode;
2390
2391 /* Unpack UTF-16 encoded data */
2392 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002393 q = (unsigned char *)s;
2394 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395
2396 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002397 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002399 /* Check for BOM marks (U+FEFF) in the input and adjust current
2400 byte order setting accordingly. In native mode, the leading BOM
2401 mark is skipped, in all other modes, it is copied to the output
2402 stream as-is (giving a ZWNBSP character). */
2403 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002404 if (size >= 2) {
2405 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002406#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002407 if (bom == 0xFEFF) {
2408 q += 2;
2409 bo = -1;
2410 }
2411 else if (bom == 0xFFFE) {
2412 q += 2;
2413 bo = 1;
2414 }
Tim Petersced69f82003-09-16 20:30:58 +00002415#else
Walter Dörwald69652032004-09-07 20:24:22 +00002416 if (bom == 0xFEFF) {
2417 q += 2;
2418 bo = 1;
2419 }
2420 else if (bom == 0xFFFE) {
2421 q += 2;
2422 bo = -1;
2423 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002424#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002425 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427
Tim Peters772747b2001-08-09 22:21:55 +00002428 if (bo == -1) {
2429 /* force LE */
2430 ihi = 1;
2431 ilo = 0;
2432 }
2433 else if (bo == 1) {
2434 /* force BE */
2435 ihi = 0;
2436 ilo = 1;
2437 }
2438
2439 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002440 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002441 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002442 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002443 if (consumed)
2444 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002445 errmsg = "truncated data";
2446 startinpos = ((const char *)q)-starts;
2447 endinpos = ((const char *)e)-starts;
2448 goto utf16Error;
2449 /* The remaining input chars are ignored if the callback
2450 chooses to skip the input */
2451 }
2452 ch = (q[ihi] << 8) | q[ilo];
2453
Tim Peters772747b2001-08-09 22:21:55 +00002454 q += 2;
2455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456 if (ch < 0xD800 || ch > 0xDFFF) {
2457 *p++ = ch;
2458 continue;
2459 }
2460
2461 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002462 if (q >= e) {
2463 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 startinpos = (((const char *)q)-2)-starts;
2465 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002466 goto utf16Error;
2467 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002468 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002469 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2470 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002471 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002472#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002473 *p++ = ch;
2474 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002475#else
2476 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002477#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002478 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002479 }
2480 else {
2481 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 startinpos = (((const char *)q)-4)-starts;
2483 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002484 goto utf16Error;
2485 }
2486
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002488 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002489 startinpos = (((const char *)q)-2)-starts;
2490 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002491 /* Fall through to report the error */
2492
2493 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002494 outpos = p-PyUnicode_AS_UNICODE(unicode);
2495 if (unicode_decode_call_errorhandler(
2496 errors, &errorHandler,
2497 "utf16", errmsg,
2498 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2499 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002500 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 }
2502
2503 if (byteorder)
2504 *byteorder = bo;
2505
Walter Dörwald69652032004-09-07 20:24:22 +00002506 if (consumed)
2507 *consumed = (const char *)q-starts;
2508
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002510 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 goto onError;
2512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002513 Py_XDECREF(errorHandler);
2514 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 return (PyObject *)unicode;
2516
2517onError:
2518 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002519 Py_XDECREF(errorHandler);
2520 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 return NULL;
2522}
2523
Tim Peters772747b2001-08-09 22:21:55 +00002524PyObject *
2525PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002526 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002527 const char *errors,
2528 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529{
2530 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002531 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002532 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002533#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002534 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002535#else
2536 const int pairs = 0;
2537#endif
Tim Peters772747b2001-08-09 22:21:55 +00002538 /* Offsets from p for storing byte pairs in the right order. */
2539#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2540 int ihi = 1, ilo = 0;
2541#else
2542 int ihi = 0, ilo = 1;
2543#endif
2544
2545#define STORECHAR(CH) \
2546 do { \
2547 p[ihi] = ((CH) >> 8) & 0xff; \
2548 p[ilo] = (CH) & 0xff; \
2549 p += 2; \
2550 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002552#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002553 for (i = pairs = 0; i < size; i++)
2554 if (s[i] >= 0x10000)
2555 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002556#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002557 /* 2 * (size + pairs + (byteorder == 0)) */
2558 if (size > PY_SSIZE_T_MAX ||
2559 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2560 return PyErr_NoMemory();
2561 nsize = size + pairs + (byteorder == 0);
2562 bytesize = nsize * 2;
2563 if (bytesize / 2 != nsize)
2564 return PyErr_NoMemory();
2565 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 if (v == NULL)
2567 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002569 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002571 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002572 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002573 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002574
2575 if (byteorder == -1) {
2576 /* force LE */
2577 ihi = 1;
2578 ilo = 0;
2579 }
2580 else if (byteorder == 1) {
2581 /* force BE */
2582 ihi = 0;
2583 ilo = 1;
2584 }
2585
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002586 while (size-- > 0) {
2587 Py_UNICODE ch = *s++;
2588 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002589#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002590 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002591 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2592 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002594#endif
Tim Peters772747b2001-08-09 22:21:55 +00002595 STORECHAR(ch);
2596 if (ch2)
2597 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002600#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601}
2602
2603PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2604{
2605 if (!PyUnicode_Check(unicode)) {
2606 PyErr_BadArgument();
2607 return NULL;
2608 }
2609 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2610 PyUnicode_GET_SIZE(unicode),
2611 NULL,
2612 0);
2613}
2614
2615/* --- Unicode Escape Codec ----------------------------------------------- */
2616
Fredrik Lundh06d12682001-01-24 07:59:11 +00002617static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002618
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002620 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 const char *errors)
2622{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002624 Py_ssize_t startinpos;
2625 Py_ssize_t endinpos;
2626 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002629 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002631 char* message;
2632 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 PyObject *errorHandler = NULL;
2634 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002635
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 /* Escaped strings will always be longer than the resulting
2637 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002638 length after conversion to the true value.
2639 (but if the error callback returns a long replacement string
2640 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 v = _PyUnicode_New(size);
2642 if (v == NULL)
2643 goto onError;
2644 if (size == 0)
2645 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002647 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002649
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 while (s < end) {
2651 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002652 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002653 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654
2655 /* Non-escape characters are interpreted as Unicode ordinals */
2656 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002657 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658 continue;
2659 }
2660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002661 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 /* \ - Escapes */
2663 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002664 c = *s++;
2665 if (s > end)
2666 c = '\0'; /* Invalid after \ */
2667 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668
2669 /* \x escapes */
2670 case '\n': break;
2671 case '\\': *p++ = '\\'; break;
2672 case '\'': *p++ = '\''; break;
2673 case '\"': *p++ = '\"'; break;
2674 case 'b': *p++ = '\b'; break;
2675 case 'f': *p++ = '\014'; break; /* FF */
2676 case 't': *p++ = '\t'; break;
2677 case 'n': *p++ = '\n'; break;
2678 case 'r': *p++ = '\r'; break;
2679 case 'v': *p++ = '\013'; break; /* VT */
2680 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2681
2682 /* \OOO (octal) escapes */
2683 case '0': case '1': case '2': case '3':
2684 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002685 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002686 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002687 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002688 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002689 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002691 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 break;
2693
Fredrik Lundhccc74732001-02-18 22:13:49 +00002694 /* hex escapes */
2695 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002697 digits = 2;
2698 message = "truncated \\xXX escape";
2699 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700
Fredrik Lundhccc74732001-02-18 22:13:49 +00002701 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002703 digits = 4;
2704 message = "truncated \\uXXXX escape";
2705 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706
Fredrik Lundhccc74732001-02-18 22:13:49 +00002707 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002708 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002709 digits = 8;
2710 message = "truncated \\UXXXXXXXX escape";
2711 hexescape:
2712 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 outpos = p-PyUnicode_AS_UNICODE(v);
2714 if (s+digits>end) {
2715 endinpos = size;
2716 if (unicode_decode_call_errorhandler(
2717 errors, &errorHandler,
2718 "unicodeescape", "end of string in escape sequence",
2719 starts, size, &startinpos, &endinpos, &exc, &s,
2720 (PyObject **)&v, &outpos, &p))
2721 goto onError;
2722 goto nextByte;
2723 }
2724 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002725 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002726 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 endinpos = (s+i+1)-starts;
2728 if (unicode_decode_call_errorhandler(
2729 errors, &errorHandler,
2730 "unicodeescape", message,
2731 starts, size, &startinpos, &endinpos, &exc, &s,
2732 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002733 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002735 }
2736 chr = (chr<<4) & ~0xF;
2737 if (c >= '0' && c <= '9')
2738 chr += c - '0';
2739 else if (c >= 'a' && c <= 'f')
2740 chr += 10 + c - 'a';
2741 else
2742 chr += 10 + c - 'A';
2743 }
2744 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002745 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 /* _decoding_error will have already written into the
2747 target buffer. */
2748 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002749 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002750 /* when we get here, chr is a 32-bit unicode character */
2751 if (chr <= 0xffff)
2752 /* UCS-2 character */
2753 *p++ = (Py_UNICODE) chr;
2754 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002755 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002756 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002757#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002758 *p++ = chr;
2759#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002760 chr -= 0x10000L;
2761 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002762 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002763#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002764 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002765 endinpos = s-starts;
2766 outpos = p-PyUnicode_AS_UNICODE(v);
2767 if (unicode_decode_call_errorhandler(
2768 errors, &errorHandler,
2769 "unicodeescape", "illegal Unicode character",
2770 starts, size, &startinpos, &endinpos, &exc, &s,
2771 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002772 goto onError;
2773 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002774 break;
2775
2776 /* \N{name} */
2777 case 'N':
2778 message = "malformed \\N character escape";
2779 if (ucnhash_CAPI == NULL) {
2780 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002781 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002782 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002783 if (m == NULL)
2784 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002785 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002786 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002787 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002788 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002789 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002790 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002791 if (ucnhash_CAPI == NULL)
2792 goto ucnhashError;
2793 }
2794 if (*s == '{') {
2795 const char *start = s+1;
2796 /* look for the closing brace */
2797 while (*s != '}' && s < end)
2798 s++;
2799 if (s > start && s < end && *s == '}') {
2800 /* found a name. look it up in the unicode database */
2801 message = "unknown Unicode character name";
2802 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002803 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002804 goto store;
2805 }
2806 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 endinpos = s-starts;
2808 outpos = p-PyUnicode_AS_UNICODE(v);
2809 if (unicode_decode_call_errorhandler(
2810 errors, &errorHandler,
2811 "unicodeescape", message,
2812 starts, size, &startinpos, &endinpos, &exc, &s,
2813 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002814 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002815 break;
2816
2817 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002818 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002819 message = "\\ at end of string";
2820 s--;
2821 endinpos = s-starts;
2822 outpos = p-PyUnicode_AS_UNICODE(v);
2823 if (unicode_decode_call_errorhandler(
2824 errors, &errorHandler,
2825 "unicodeescape", message,
2826 starts, size, &startinpos, &endinpos, &exc, &s,
2827 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002828 goto onError;
2829 }
2830 else {
2831 *p++ = '\\';
2832 *p++ = (unsigned char)s[-1];
2833 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002834 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 nextByte:
2837 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002839 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002840 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002841 Py_XDECREF(errorHandler);
2842 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002844
Fredrik Lundhccc74732001-02-18 22:13:49 +00002845ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002846 PyErr_SetString(
2847 PyExc_UnicodeError,
2848 "\\N escapes not supported (can't load unicodedata module)"
2849 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002850 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 Py_XDECREF(errorHandler);
2852 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002853 return NULL;
2854
Fredrik Lundhccc74732001-02-18 22:13:49 +00002855onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 Py_XDECREF(errorHandler);
2858 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 return NULL;
2860}
2861
2862/* Return a Unicode-Escape string version of the Unicode object.
2863
2864 If quotes is true, the string is enclosed in u"" or u'' quotes as
2865 appropriate.
2866
2867*/
2868
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002869Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002870 Py_ssize_t size,
2871 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002872{
2873 /* like wcschr, but doesn't stop at NULL characters */
2874
2875 while (size-- > 0) {
2876 if (*s == ch)
2877 return s;
2878 s++;
2879 }
2880
2881 return NULL;
2882}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002883
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884static
2885PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002886 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 int quotes)
2888{
2889 PyObject *repr;
2890 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002892 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002893#ifdef Py_UNICODE_WIDE
2894 const Py_ssize_t expandsize = 10;
2895#else
2896 const Py_ssize_t expandsize = 6;
2897#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898
Neal Norwitz17753ec2006-08-21 22:21:19 +00002899 /* XXX(nnorwitz): rather than over-allocating, it would be
2900 better to choose a different scheme. Perhaps scan the
2901 first N-chars of the string and allocate based on that size.
2902 */
2903 /* Initial allocation is based on the longest-possible unichr
2904 escape.
2905
2906 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2907 unichr, so in this case it's the longest unichr escape. In
2908 narrow (UTF-16) builds this is five chars per source unichr
2909 since there are two unichrs in the surrogate pair, so in narrow
2910 (UTF-16) builds it's not the longest unichr escape.
2911
2912 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2913 so in the narrow (UTF-16) build case it's the longest unichr
2914 escape.
2915 */
2916
Neal Norwitze7d8be82008-07-31 17:17:14 +00002917 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2918 return PyErr_NoMemory();
2919
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002920 repr = PyString_FromStringAndSize(NULL,
Neal Norwitz17753ec2006-08-21 22:21:19 +00002921 2
Neal Norwitze7d8be82008-07-31 17:17:14 +00002922 + expandsize*size
Neal Norwitz17753ec2006-08-21 22:21:19 +00002923 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924 if (repr == NULL)
2925 return NULL;
2926
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002927 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928
2929 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002931 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 !findchar(s, size, '"')) ? '"' : '\'';
2933 }
2934 while (size-- > 0) {
2935 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002936
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002937 /* Escape quotes and backslashes */
2938 if ((quotes &&
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002939 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 *p++ = '\\';
2941 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002942 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002943 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002944
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002945#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002946 /* Map 21-bit characters to '\U00xxxxxx' */
2947 else if (ch >= 0x10000) {
2948 *p++ = '\\';
2949 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002950 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2951 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2952 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2953 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2954 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2955 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2956 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002957 *p++ = hexdigit[ch & 0x0000000F];
2958 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002959 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002960#else
2961 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002962 else if (ch >= 0xD800 && ch < 0xDC00) {
2963 Py_UNICODE ch2;
2964 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002965
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002966 ch2 = *s++;
2967 size--;
2968 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2969 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2970 *p++ = '\\';
2971 *p++ = 'U';
2972 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2973 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2974 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2975 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2976 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2977 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2978 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2979 *p++ = hexdigit[ucs & 0x0000000F];
2980 continue;
2981 }
2982 /* Fall through: isolated surrogates are copied as-is */
2983 s--;
2984 size++;
2985 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002986#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002987
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002989 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 *p++ = '\\';
2991 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002992 *p++ = hexdigit[(ch >> 12) & 0x000F];
2993 *p++ = hexdigit[(ch >> 8) & 0x000F];
2994 *p++ = hexdigit[(ch >> 4) & 0x000F];
2995 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002997
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002998 /* Map special whitespace to '\t', \n', '\r' */
2999 else if (ch == '\t') {
3000 *p++ = '\\';
3001 *p++ = 't';
3002 }
3003 else if (ch == '\n') {
3004 *p++ = '\\';
3005 *p++ = 'n';
3006 }
3007 else if (ch == '\r') {
3008 *p++ = '\\';
3009 *p++ = 'r';
3010 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003011
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003012 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003013 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003015 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003016 *p++ = hexdigit[(ch >> 4) & 0x000F];
3017 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003018 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003019
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 /* Copy everything else as-is */
3021 else
3022 *p++ = (char) ch;
3023 }
3024 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003025 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026
3027 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003028 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 return repr;
3030}
3031
3032PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003033 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034{
3035 return unicodeescape_string(s, size, 0);
3036}
3037
3038PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3039{
3040 if (!PyUnicode_Check(unicode)) {
3041 PyErr_BadArgument();
3042 return NULL;
3043 }
3044 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3045 PyUnicode_GET_SIZE(unicode));
3046}
3047
3048/* --- Raw Unicode Escape Codec ------------------------------------------- */
3049
3050PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003051 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 const char *errors)
3053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003055 Py_ssize_t startinpos;
3056 Py_ssize_t endinpos;
3057 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 const char *end;
3061 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 PyObject *errorHandler = NULL;
3063 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003064
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 /* Escaped strings will always be longer than the resulting
3066 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003067 length after conversion to the true value. (But decoding error
3068 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 v = _PyUnicode_New(size);
3070 if (v == NULL)
3071 goto onError;
3072 if (size == 0)
3073 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 end = s + size;
3076 while (s < end) {
3077 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003078 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003080 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081
3082 /* Non-escape characters are interpreted as Unicode ordinals */
3083 if (*s != '\\') {
3084 *p++ = (unsigned char)*s++;
3085 continue;
3086 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003087 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088
3089 /* \u-escapes are only interpreted iff the number of leading
3090 backslashes if odd */
3091 bs = s;
3092 for (;s < end;) {
3093 if (*s != '\\')
3094 break;
3095 *p++ = (unsigned char)*s++;
3096 }
3097 if (((s - bs) & 1) == 0 ||
3098 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003099 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 continue;
3101 }
3102 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003103 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 s++;
3105
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003106 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003108 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 endinpos = s-starts;
3112 if (unicode_decode_call_errorhandler(
3113 errors, &errorHandler,
3114 "rawunicodeescape", "truncated \\uXXXX",
3115 starts, size, &startinpos, &endinpos, &exc, &s,
3116 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 }
3120 x = (x<<4) & ~0xF;
3121 if (c >= '0' && c <= '9')
3122 x += c - '0';
3123 else if (c >= 'a' && c <= 'f')
3124 x += 10 + c - 'a';
3125 else
3126 x += 10 + c - 'A';
3127 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003128 if (x <= 0xffff)
3129 /* UCS-2 character */
3130 *p++ = (Py_UNICODE) x;
3131 else if (x <= 0x10ffff) {
3132 /* UCS-4 character. Either store directly, or as
3133 surrogate pair. */
3134#ifdef Py_UNICODE_WIDE
Amaury Forgeot d'Arcfac02fa2008-03-24 21:04:10 +00003135 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003136#else
3137 x -= 0x10000L;
3138 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3139 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3140#endif
3141 } else {
3142 endinpos = s-starts;
3143 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003144 if (unicode_decode_call_errorhandler(
3145 errors, &errorHandler,
3146 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3147 starts, size, &startinpos, &endinpos, &exc, &s,
3148 (PyObject **)&v, &outpos, &p))
3149 goto onError;
3150 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151 nextByte:
3152 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003154 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003155 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003156 Py_XDECREF(errorHandler);
3157 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003159
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 onError:
3161 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003162 Py_XDECREF(errorHandler);
3163 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 return NULL;
3165}
3166
3167PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003168 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169{
3170 PyObject *repr;
3171 char *p;
3172 char *q;
3173
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003174 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003175#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003176 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003177#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003178 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003179#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00003180
3181 if (size > PY_SSIZE_T_MAX / expandsize)
3182 return PyErr_NoMemory();
3183
3184 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 if (repr == NULL)
3186 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003187 if (size == 0)
3188 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003190 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 while (size-- > 0) {
3192 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003193#ifdef Py_UNICODE_WIDE
3194 /* Map 32-bit characters to '\Uxxxxxxxx' */
3195 if (ch >= 0x10000) {
3196 *p++ = '\\';
3197 *p++ = 'U';
3198 *p++ = hexdigit[(ch >> 28) & 0xf];
3199 *p++ = hexdigit[(ch >> 24) & 0xf];
3200 *p++ = hexdigit[(ch >> 20) & 0xf];
3201 *p++ = hexdigit[(ch >> 16) & 0xf];
3202 *p++ = hexdigit[(ch >> 12) & 0xf];
3203 *p++ = hexdigit[(ch >> 8) & 0xf];
3204 *p++ = hexdigit[(ch >> 4) & 0xf];
3205 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003206 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003207 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003208#else
3209 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3210 if (ch >= 0xD800 && ch < 0xDC00) {
3211 Py_UNICODE ch2;
3212 Py_UCS4 ucs;
3213
3214 ch2 = *s++;
3215 size--;
3216 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3217 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3218 *p++ = '\\';
3219 *p++ = 'U';
3220 *p++ = hexdigit[(ucs >> 28) & 0xf];
3221 *p++ = hexdigit[(ucs >> 24) & 0xf];
3222 *p++ = hexdigit[(ucs >> 20) & 0xf];
3223 *p++ = hexdigit[(ucs >> 16) & 0xf];
3224 *p++ = hexdigit[(ucs >> 12) & 0xf];
3225 *p++ = hexdigit[(ucs >> 8) & 0xf];
3226 *p++ = hexdigit[(ucs >> 4) & 0xf];
3227 *p++ = hexdigit[ucs & 0xf];
3228 continue;
3229 }
3230 /* Fall through: isolated surrogates are copied as-is */
3231 s--;
3232 size++;
3233 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003234#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 /* Map 16-bit characters to '\uxxxx' */
3236 if (ch >= 256) {
3237 *p++ = '\\';
3238 *p++ = 'u';
3239 *p++ = hexdigit[(ch >> 12) & 0xf];
3240 *p++ = hexdigit[(ch >> 8) & 0xf];
3241 *p++ = hexdigit[(ch >> 4) & 0xf];
3242 *p++ = hexdigit[ch & 15];
3243 }
3244 /* Copy everything else as-is */
3245 else
3246 *p++ = (char) ch;
3247 }
3248 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003249 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 return repr;
3251}
3252
3253PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3254{
3255 if (!PyUnicode_Check(unicode)) {
3256 PyErr_BadArgument();
3257 return NULL;
3258 }
3259 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3260 PyUnicode_GET_SIZE(unicode));
3261}
3262
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003263/* --- Unicode Internal Codec ------------------------------------------- */
3264
3265PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003266 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003267 const char *errors)
3268{
3269 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003270 Py_ssize_t startinpos;
3271 Py_ssize_t endinpos;
3272 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003273 PyUnicodeObject *v;
3274 Py_UNICODE *p;
3275 const char *end;
3276 const char *reason;
3277 PyObject *errorHandler = NULL;
3278 PyObject *exc = NULL;
3279
Neal Norwitzd43069c2006-01-08 01:12:10 +00003280#ifdef Py_UNICODE_WIDE
3281 Py_UNICODE unimax = PyUnicode_GetMax();
3282#endif
3283
Armin Rigo7ccbca92006-10-04 12:17:45 +00003284 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003285 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3286 if (v == NULL)
3287 goto onError;
3288 if (PyUnicode_GetSize((PyObject *)v) == 0)
3289 return (PyObject *)v;
3290 p = PyUnicode_AS_UNICODE(v);
3291 end = s + size;
3292
3293 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003294 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003295 /* We have to sanity check the raw data, otherwise doom looms for
3296 some malformed UCS-4 data. */
3297 if (
3298 #ifdef Py_UNICODE_WIDE
3299 *p > unimax || *p < 0 ||
3300 #endif
3301 end-s < Py_UNICODE_SIZE
3302 )
3303 {
3304 startinpos = s - starts;
3305 if (end-s < Py_UNICODE_SIZE) {
3306 endinpos = end-starts;
3307 reason = "truncated input";
3308 }
3309 else {
3310 endinpos = s - starts + Py_UNICODE_SIZE;
3311 reason = "illegal code point (> 0x10FFFF)";
3312 }
3313 outpos = p - PyUnicode_AS_UNICODE(v);
3314 if (unicode_decode_call_errorhandler(
3315 errors, &errorHandler,
3316 "unicode_internal", reason,
3317 starts, size, &startinpos, &endinpos, &exc, &s,
3318 (PyObject **)&v, &outpos, &p)) {
3319 goto onError;
3320 }
3321 }
3322 else {
3323 p++;
3324 s += Py_UNICODE_SIZE;
3325 }
3326 }
3327
Martin v. Löwis412fb672006-04-13 06:34:32 +00003328 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003329 goto onError;
3330 Py_XDECREF(errorHandler);
3331 Py_XDECREF(exc);
3332 return (PyObject *)v;
3333
3334 onError:
3335 Py_XDECREF(v);
3336 Py_XDECREF(errorHandler);
3337 Py_XDECREF(exc);
3338 return NULL;
3339}
3340
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341/* --- Latin-1 Codec ------------------------------------------------------ */
3342
3343PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003344 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 const char *errors)
3346{
3347 PyUnicodeObject *v;
3348 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003349
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003351 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003352 Py_UNICODE r = *(unsigned char*)s;
3353 return PyUnicode_FromUnicode(&r, 1);
3354 }
3355
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 v = _PyUnicode_New(size);
3357 if (v == NULL)
3358 goto onError;
3359 if (size == 0)
3360 return (PyObject *)v;
3361 p = PyUnicode_AS_UNICODE(v);
3362 while (size-- > 0)
3363 *p++ = (unsigned char)*s++;
3364 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003365
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 onError:
3367 Py_XDECREF(v);
3368 return NULL;
3369}
3370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371/* create or adjust a UnicodeEncodeError */
3372static void make_encode_exception(PyObject **exceptionObject,
3373 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003374 const Py_UNICODE *unicode, Py_ssize_t size,
3375 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 if (*exceptionObject == NULL) {
3379 *exceptionObject = PyUnicodeEncodeError_Create(
3380 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381 }
3382 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3384 goto onError;
3385 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3386 goto onError;
3387 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3388 goto onError;
3389 return;
3390 onError:
3391 Py_DECREF(*exceptionObject);
3392 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 }
3394}
3395
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396/* raises a UnicodeEncodeError */
3397static void raise_encode_exception(PyObject **exceptionObject,
3398 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003399 const Py_UNICODE *unicode, Py_ssize_t size,
3400 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 const char *reason)
3402{
3403 make_encode_exception(exceptionObject,
3404 encoding, unicode, size, startpos, endpos, reason);
3405 if (*exceptionObject != NULL)
3406 PyCodec_StrictErrors(*exceptionObject);
3407}
3408
3409/* error handling callback helper:
3410 build arguments, call the callback and check the arguments,
3411 put the result into newpos and return the replacement string, which
3412 has to be freed by the caller */
3413static PyObject *unicode_encode_call_errorhandler(const char *errors,
3414 PyObject **errorHandler,
3415 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003416 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3417 Py_ssize_t startpos, Py_ssize_t endpos,
3418 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003420 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421
3422 PyObject *restuple;
3423 PyObject *resunicode;
3424
3425 if (*errorHandler == NULL) {
3426 *errorHandler = PyCodec_LookupError(errors);
3427 if (*errorHandler == NULL)
3428 return NULL;
3429 }
3430
3431 make_encode_exception(exceptionObject,
3432 encoding, unicode, size, startpos, endpos, reason);
3433 if (*exceptionObject == NULL)
3434 return NULL;
3435
3436 restuple = PyObject_CallFunctionObjArgs(
3437 *errorHandler, *exceptionObject, NULL);
3438 if (restuple == NULL)
3439 return NULL;
3440 if (!PyTuple_Check(restuple)) {
3441 PyErr_Format(PyExc_TypeError, &argparse[4]);
3442 Py_DECREF(restuple);
3443 return NULL;
3444 }
3445 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3446 &resunicode, newpos)) {
3447 Py_DECREF(restuple);
3448 return NULL;
3449 }
3450 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003451 *newpos = size+*newpos;
3452 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003453 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003454 Py_DECREF(restuple);
3455 return NULL;
3456 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 Py_INCREF(resunicode);
3458 Py_DECREF(restuple);
3459 return resunicode;
3460}
3461
3462static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003463 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 const char *errors,
3465 int limit)
3466{
3467 /* output object */
3468 PyObject *res;
3469 /* pointers to the beginning and end+1 of input */
3470 const Py_UNICODE *startp = p;
3471 const Py_UNICODE *endp = p + size;
3472 /* pointer to the beginning of the unencodable characters */
3473 /* const Py_UNICODE *badp = NULL; */
3474 /* pointer into the output */
3475 char *str;
3476 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003477 Py_ssize_t respos = 0;
3478 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003479 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3480 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481 PyObject *errorHandler = NULL;
3482 PyObject *exc = NULL;
3483 /* the following variable is used for caching string comparisons
3484 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3485 int known_errorHandler = -1;
3486
3487 /* allocate enough for a simple encoding without
3488 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003489 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 if (res == NULL)
3491 goto onError;
3492 if (size == 0)
3493 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003494 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 ressize = size;
3496
3497 while (p<endp) {
3498 Py_UNICODE c = *p;
3499
3500 /* can we encode this? */
3501 if (c<limit) {
3502 /* no overflow check, because we know that the space is enough */
3503 *str++ = (char)c;
3504 ++p;
3505 }
3506 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003507 Py_ssize_t unicodepos = p-startp;
3508 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003510 Py_ssize_t repsize;
3511 Py_ssize_t newpos;
3512 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 Py_UNICODE *uni2;
3514 /* startpos for collecting unencodable chars */
3515 const Py_UNICODE *collstart = p;
3516 const Py_UNICODE *collend = p;
3517 /* find all unecodable characters */
3518 while ((collend < endp) && ((*collend)>=limit))
3519 ++collend;
3520 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3521 if (known_errorHandler==-1) {
3522 if ((errors==NULL) || (!strcmp(errors, "strict")))
3523 known_errorHandler = 1;
3524 else if (!strcmp(errors, "replace"))
3525 known_errorHandler = 2;
3526 else if (!strcmp(errors, "ignore"))
3527 known_errorHandler = 3;
3528 else if (!strcmp(errors, "xmlcharrefreplace"))
3529 known_errorHandler = 4;
3530 else
3531 known_errorHandler = 0;
3532 }
3533 switch (known_errorHandler) {
3534 case 1: /* strict */
3535 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3536 goto onError;
3537 case 2: /* replace */
3538 while (collstart++<collend)
3539 *str++ = '?'; /* fall through */
3540 case 3: /* ignore */
3541 p = collend;
3542 break;
3543 case 4: /* xmlcharrefreplace */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003544 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 /* determine replacement size (temporarily (mis)uses p) */
3546 for (p = collstart, repsize = 0; p < collend; ++p) {
3547 if (*p<10)
3548 repsize += 2+1+1;
3549 else if (*p<100)
3550 repsize += 2+2+1;
3551 else if (*p<1000)
3552 repsize += 2+3+1;
3553 else if (*p<10000)
3554 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003555#ifndef Py_UNICODE_WIDE
3556 else
3557 repsize += 2+5+1;
3558#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 else if (*p<100000)
3560 repsize += 2+5+1;
3561 else if (*p<1000000)
3562 repsize += 2+6+1;
3563 else
3564 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003565#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 }
3567 requiredsize = respos+repsize+(endp-collend);
3568 if (requiredsize > ressize) {
3569 if (requiredsize<2*ressize)
3570 requiredsize = 2*ressize;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003571 if (_PyString_Resize(&res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003573 str = PyString_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 ressize = requiredsize;
3575 }
3576 /* generate replacement (temporarily (mis)uses p) */
3577 for (p = collstart; p < collend; ++p) {
3578 str += sprintf(str, "&#%d;", (int)*p);
3579 }
3580 p = collend;
3581 break;
3582 default:
3583 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3584 encoding, reason, startp, size, &exc,
3585 collstart-startp, collend-startp, &newpos);
3586 if (repunicode == NULL)
3587 goto onError;
3588 /* need more space? (at least enough for what we
3589 have+the replacement+the rest of the string, so
3590 we won't have to check space for encodable characters) */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003591 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 repsize = PyUnicode_GET_SIZE(repunicode);
3593 requiredsize = respos+repsize+(endp-collend);
3594 if (requiredsize > ressize) {
3595 if (requiredsize<2*ressize)
3596 requiredsize = 2*ressize;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003597 if (_PyString_Resize(&res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 Py_DECREF(repunicode);
3599 goto onError;
3600 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003601 str = PyString_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 ressize = requiredsize;
3603 }
3604 /* check if there is anything unencodable in the replacement
3605 and copy it to the output */
3606 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3607 c = *uni2;
3608 if (c >= limit) {
3609 raise_encode_exception(&exc, encoding, startp, size,
3610 unicodepos, unicodepos+1, reason);
3611 Py_DECREF(repunicode);
3612 goto onError;
3613 }
3614 *str = (char)c;
3615 }
3616 p = startp + newpos;
3617 Py_DECREF(repunicode);
3618 }
3619 }
3620 }
3621 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003622 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 if (respos<ressize)
3624 /* If this falls res will be NULL */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003625 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626 Py_XDECREF(errorHandler);
3627 Py_XDECREF(exc);
3628 return res;
3629
3630 onError:
3631 Py_XDECREF(res);
3632 Py_XDECREF(errorHandler);
3633 Py_XDECREF(exc);
3634 return NULL;
3635}
3636
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003638 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 const char *errors)
3640{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642}
3643
3644PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3645{
3646 if (!PyUnicode_Check(unicode)) {
3647 PyErr_BadArgument();
3648 return NULL;
3649 }
3650 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3651 PyUnicode_GET_SIZE(unicode),
3652 NULL);
3653}
3654
3655/* --- 7-bit ASCII Codec -------------------------------------------------- */
3656
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003658 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 const char *errors)
3660{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 PyUnicodeObject *v;
3663 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003664 Py_ssize_t startinpos;
3665 Py_ssize_t endinpos;
3666 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 const char *e;
3668 PyObject *errorHandler = NULL;
3669 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003670
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003672 if (size == 1 && *(unsigned char*)s < 128) {
3673 Py_UNICODE r = *(unsigned char*)s;
3674 return PyUnicode_FromUnicode(&r, 1);
3675 }
Tim Petersced69f82003-09-16 20:30:58 +00003676
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 v = _PyUnicode_New(size);
3678 if (v == NULL)
3679 goto onError;
3680 if (size == 0)
3681 return (PyObject *)v;
3682 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 e = s + size;
3684 while (s < e) {
3685 register unsigned char c = (unsigned char)*s;
3686 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 ++s;
3689 }
3690 else {
3691 startinpos = s-starts;
3692 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003693 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 if (unicode_decode_call_errorhandler(
3695 errors, &errorHandler,
3696 "ascii", "ordinal not in range(128)",
3697 starts, size, &startinpos, &endinpos, &exc, &s,
3698 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003702 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003703 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003704 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 Py_XDECREF(errorHandler);
3706 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003708
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 onError:
3710 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 Py_XDECREF(errorHandler);
3712 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 return NULL;
3714}
3715
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003717 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718 const char *errors)
3719{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721}
3722
3723PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3724{
3725 if (!PyUnicode_Check(unicode)) {
3726 PyErr_BadArgument();
3727 return NULL;
3728 }
3729 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3730 PyUnicode_GET_SIZE(unicode),
3731 NULL);
3732}
3733
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003734#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003735
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003736/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003737
Martin v. Löwisd8251432006-06-14 05:21:04 +00003738#if SIZEOF_INT < SIZEOF_SSIZE_T
3739#define NEED_RETRY
3740#endif
3741
3742/* XXX This code is limited to "true" double-byte encodings, as
3743 a) it assumes an incomplete character consists of a single byte, and
3744 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3745 encodings, see IsDBCSLeadByteEx documentation. */
3746
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   IsDBCSLeadByte() alone only inspects the byte value, so the character
   preceding it (found with CharPrev()) is examined as well: the byte
   counts as a lead byte when there is no preceding character, when the
   preceding character does not start with a lead byte, or when the
   preceding character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3757
3758/*
3759 * Decode MBCS string into unicode object. If 'final' is set, converts
3760 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3761 */
3762static int decode_mbcs(PyUnicodeObject **v,
3763 const char *s, /* MBCS string */
3764 int size, /* sizeof MBCS string */
3765 int final)
3766{
3767 Py_UNICODE *p;
3768 Py_ssize_t n = 0;
3769 int usize = 0;
3770
3771 assert(size >= 0);
3772
3773 /* Skip trailing lead-byte unless 'final' is set */
3774 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3775 --size;
3776
3777 /* First get the size of the result */
3778 if (size > 0) {
3779 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3780 if (usize == 0) {
3781 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3782 return -1;
3783 }
3784 }
3785
3786 if (*v == NULL) {
3787 /* Create unicode object */
3788 *v = _PyUnicode_New(usize);
3789 if (*v == NULL)
3790 return -1;
3791 }
3792 else {
3793 /* Extend unicode object */
3794 n = PyUnicode_GET_SIZE(*v);
3795 if (_PyUnicode_Resize(v, n + usize) < 0)
3796 return -1;
3797 }
3798
3799 /* Do the conversion */
3800 if (size > 0) {
3801 p = PyUnicode_AS_UNICODE(*v) + n;
3802 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3803 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3804 return -1;
3805 }
3806 }
3807
3808 return size;
3809}
3810
3811PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3812 Py_ssize_t size,
3813 const char *errors,
3814 Py_ssize_t *consumed)
3815{
3816 PyUnicodeObject *v = NULL;
3817 int done;
3818
3819 if (consumed)
3820 *consumed = 0;
3821
3822#ifdef NEED_RETRY
3823 retry:
3824 if (size > INT_MAX)
3825 done = decode_mbcs(&v, s, INT_MAX, 0);
3826 else
3827#endif
3828 done = decode_mbcs(&v, s, (int)size, !consumed);
3829
3830 if (done < 0) {
3831 Py_XDECREF(v);
3832 return NULL;
3833 }
3834
3835 if (consumed)
3836 *consumed += done;
3837
3838#ifdef NEED_RETRY
3839 if (size > INT_MAX) {
3840 s += done;
3841 size -= done;
3842 goto retry;
3843 }
3844#endif
3845
3846 return (PyObject *)v;
3847}
3848
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003849PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003850 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003851 const char *errors)
3852{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003853 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3854}
3855
3856/*
3857 * Convert unicode into string object (MBCS).
3858 * Returns 0 if succeed, -1 otherwise.
3859 */
3860static int encode_mbcs(PyObject **repr,
3861 const Py_UNICODE *p, /* unicode */
3862 int size) /* size of unicode */
3863{
3864 int mbcssize = 0;
3865 Py_ssize_t n = 0;
3866
3867 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003868
3869 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003870 if (size > 0) {
3871 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3872 if (mbcssize == 0) {
3873 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3874 return -1;
3875 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003876 }
3877
Martin v. Löwisd8251432006-06-14 05:21:04 +00003878 if (*repr == NULL) {
3879 /* Create string object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003880 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003881 if (*repr == NULL)
3882 return -1;
3883 }
3884 else {
3885 /* Extend string object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003886 n = PyString_Size(*repr);
3887 if (_PyString_Resize(repr, n + mbcssize) < 0)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003888 return -1;
3889 }
3890
3891 /* Do the conversion */
3892 if (size > 0) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003893 char *s = PyString_AS_STRING(*repr) + n;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003894 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3895 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3896 return -1;
3897 }
3898 }
3899
3900 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003901}
3902
3903PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003904 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003905 const char *errors)
3906{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003907 PyObject *repr = NULL;
3908 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003909
Martin v. Löwisd8251432006-06-14 05:21:04 +00003910#ifdef NEED_RETRY
3911 retry:
3912 if (size > INT_MAX)
3913 ret = encode_mbcs(&repr, p, INT_MAX);
3914 else
3915#endif
3916 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003917
Martin v. Löwisd8251432006-06-14 05:21:04 +00003918 if (ret < 0) {
3919 Py_XDECREF(repr);
3920 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003921 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003922
3923#ifdef NEED_RETRY
3924 if (size > INT_MAX) {
3925 p += INT_MAX;
3926 size -= INT_MAX;
3927 goto retry;
3928 }
3929#endif
3930
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003931 return repr;
3932}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003933
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003934PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3935{
3936 if (!PyUnicode_Check(unicode)) {
3937 PyErr_BadArgument();
3938 return NULL;
3939 }
3940 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3941 PyUnicode_GET_SIZE(unicode),
3942 NULL);
3943}
3944
Martin v. Löwisd8251432006-06-14 05:21:04 +00003945#undef NEED_RETRY
3946
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003947#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003948
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949/* --- Character Mapping Codec -------------------------------------------- */
3950
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 PyObject *mapping,
3954 const char *errors)
3955{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003957 Py_ssize_t startinpos;
3958 Py_ssize_t endinpos;
3959 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 PyUnicodeObject *v;
3962 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003963 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 PyObject *errorHandler = NULL;
3965 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003966 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003967 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003968
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 /* Default to Latin-1 */
3970 if (mapping == NULL)
3971 return PyUnicode_DecodeLatin1(s, size, errors);
3972
3973 v = _PyUnicode_New(size);
3974 if (v == NULL)
3975 goto onError;
3976 if (size == 0)
3977 return (PyObject *)v;
3978 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003980 if (PyUnicode_CheckExact(mapping)) {
3981 mapstring = PyUnicode_AS_UNICODE(mapping);
3982 maplen = PyUnicode_GET_SIZE(mapping);
3983 while (s < e) {
3984 unsigned char ch = *s;
3985 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003987 if (ch < maplen)
3988 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003990 if (x == 0xfffe) {
3991 /* undefined mapping */
3992 outpos = p-PyUnicode_AS_UNICODE(v);
3993 startinpos = s-starts;
3994 endinpos = startinpos+1;
3995 if (unicode_decode_call_errorhandler(
3996 errors, &errorHandler,
3997 "charmap", "character maps to <undefined>",
3998 starts, size, &startinpos, &endinpos, &exc, &s,
3999 (PyObject **)&v, &outpos, &p)) {
4000 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004001 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004002 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004003 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004004 *p++ = x;
4005 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004007 }
4008 else {
4009 while (s < e) {
4010 unsigned char ch = *s;
4011 PyObject *w, *x;
4012
4013 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4014 w = PyInt_FromLong((long)ch);
4015 if (w == NULL)
4016 goto onError;
4017 x = PyObject_GetItem(mapping, w);
4018 Py_DECREF(w);
4019 if (x == NULL) {
4020 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4021 /* No mapping found means: mapping is undefined. */
4022 PyErr_Clear();
4023 x = Py_None;
4024 Py_INCREF(x);
4025 } else
4026 goto onError;
4027 }
4028
4029 /* Apply mapping */
4030 if (PyInt_Check(x)) {
4031 long value = PyInt_AS_LONG(x);
4032 if (value < 0 || value > 65535) {
4033 PyErr_SetString(PyExc_TypeError,
4034 "character mapping must be in range(65536)");
4035 Py_DECREF(x);
4036 goto onError;
4037 }
4038 *p++ = (Py_UNICODE)value;
4039 }
4040 else if (x == Py_None) {
4041 /* undefined mapping */
4042 outpos = p-PyUnicode_AS_UNICODE(v);
4043 startinpos = s-starts;
4044 endinpos = startinpos+1;
4045 if (unicode_decode_call_errorhandler(
4046 errors, &errorHandler,
4047 "charmap", "character maps to <undefined>",
4048 starts, size, &startinpos, &endinpos, &exc, &s,
4049 (PyObject **)&v, &outpos, &p)) {
4050 Py_DECREF(x);
4051 goto onError;
4052 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004053 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004054 continue;
4055 }
4056 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004057 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004058
4059 if (targetsize == 1)
4060 /* 1-1 mapping */
4061 *p++ = *PyUnicode_AS_UNICODE(x);
4062
4063 else if (targetsize > 1) {
4064 /* 1-n mapping */
4065 if (targetsize > extrachars) {
4066 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004067 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4068 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004069 (targetsize << 2);
4070 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00004071 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004072 if (_PyUnicode_Resize(&v,
4073 PyUnicode_GET_SIZE(v) + needed) < 0) {
4074 Py_DECREF(x);
4075 goto onError;
4076 }
4077 p = PyUnicode_AS_UNICODE(v) + oldpos;
4078 }
4079 Py_UNICODE_COPY(p,
4080 PyUnicode_AS_UNICODE(x),
4081 targetsize);
4082 p += targetsize;
4083 extrachars -= targetsize;
4084 }
4085 /* 1-0 mapping: skip the character */
4086 }
4087 else {
4088 /* wrong return value */
4089 PyErr_SetString(PyExc_TypeError,
4090 "character mapping must return integer, None or unicode");
4091 Py_DECREF(x);
4092 goto onError;
4093 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004095 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 }
4098 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004099 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 Py_XDECREF(errorHandler);
4102 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004104
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 Py_XDECREF(errorHandler);
4107 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 Py_XDECREF(v);
4109 return NULL;
4110}
4111
Martin v. Löwis3f767792006-06-04 19:36:28 +00004112/* Charmap encoding: the lookup table */
4113
4114struct encoding_map{
4115 PyObject_HEAD
4116 unsigned char level1[32];
4117 int count2, count3;
4118 unsigned char level23[1];
4119};
4120
4121static PyObject*
4122encoding_map_size(PyObject *obj, PyObject* args)
4123{
4124 struct encoding_map *map = (struct encoding_map*)obj;
4125 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4126 128*map->count3);
4127}
4128
4129static PyMethodDef encoding_map_methods[] = {
4130 {"size", encoding_map_size, METH_NOARGS,
4131 PyDoc_STR("Return the size (in bytes) of this object") },
4132 { 0 }
4133};
4134
4135static void
4136encoding_map_dealloc(PyObject* o)
4137{
4138 PyObject_FREE(o);
4139}
4140
4141static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004142 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004143 "EncodingMap", /*tp_name*/
4144 sizeof(struct encoding_map), /*tp_basicsize*/
4145 0, /*tp_itemsize*/
4146 /* methods */
4147 encoding_map_dealloc, /*tp_dealloc*/
4148 0, /*tp_print*/
4149 0, /*tp_getattr*/
4150 0, /*tp_setattr*/
4151 0, /*tp_compare*/
4152 0, /*tp_repr*/
4153 0, /*tp_as_number*/
4154 0, /*tp_as_sequence*/
4155 0, /*tp_as_mapping*/
4156 0, /*tp_hash*/
4157 0, /*tp_call*/
4158 0, /*tp_str*/
4159 0, /*tp_getattro*/
4160 0, /*tp_setattro*/
4161 0, /*tp_as_buffer*/
4162 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4163 0, /*tp_doc*/
4164 0, /*tp_traverse*/
4165 0, /*tp_clear*/
4166 0, /*tp_richcompare*/
4167 0, /*tp_weaklistoffset*/
4168 0, /*tp_iter*/
4169 0, /*tp_iternext*/
4170 encoding_map_methods, /*tp_methods*/
4171 0, /*tp_members*/
4172 0, /*tp_getset*/
4173 0, /*tp_base*/
4174 0, /*tp_dict*/
4175 0, /*tp_descr_get*/
4176 0, /*tp_descr_set*/
4177 0, /*tp_dictoffset*/
4178 0, /*tp_init*/
4179 0, /*tp_alloc*/
4180 0, /*tp_new*/
4181 0, /*tp_free*/
4182 0, /*tp_is_gc*/
4183};
4184
4185PyObject*
4186PyUnicode_BuildEncodingMap(PyObject* string)
4187{
4188 Py_UNICODE *decode;
4189 PyObject *result;
4190 struct encoding_map *mresult;
4191 int i;
4192 int need_dict = 0;
4193 unsigned char level1[32];
4194 unsigned char level2[512];
4195 unsigned char *mlevel1, *mlevel2, *mlevel3;
4196 int count2 = 0, count3 = 0;
4197
4198 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4199 PyErr_BadArgument();
4200 return NULL;
4201 }
4202 decode = PyUnicode_AS_UNICODE(string);
4203 memset(level1, 0xFF, sizeof level1);
4204 memset(level2, 0xFF, sizeof level2);
4205
4206 /* If there isn't a one-to-one mapping of NULL to \0,
4207 or if there are non-BMP characters, we need to use
4208 a mapping dictionary. */
4209 if (decode[0] != 0)
4210 need_dict = 1;
4211 for (i = 1; i < 256; i++) {
4212 int l1, l2;
4213 if (decode[i] == 0
4214 #ifdef Py_UNICODE_WIDE
4215 || decode[i] > 0xFFFF
4216 #endif
4217 ) {
4218 need_dict = 1;
4219 break;
4220 }
4221 if (decode[i] == 0xFFFE)
4222 /* unmapped character */
4223 continue;
4224 l1 = decode[i] >> 11;
4225 l2 = decode[i] >> 7;
4226 if (level1[l1] == 0xFF)
4227 level1[l1] = count2++;
4228 if (level2[l2] == 0xFF)
4229 level2[l2] = count3++;
4230 }
4231
4232 if (count2 >= 0xFF || count3 >= 0xFF)
4233 need_dict = 1;
4234
4235 if (need_dict) {
4236 PyObject *result = PyDict_New();
4237 PyObject *key, *value;
4238 if (!result)
4239 return NULL;
4240 for (i = 0; i < 256; i++) {
4241 key = value = NULL;
4242 key = PyInt_FromLong(decode[i]);
4243 value = PyInt_FromLong(i);
4244 if (!key || !value)
4245 goto failed1;
4246 if (PyDict_SetItem(result, key, value) == -1)
4247 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004248 Py_DECREF(key);
4249 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004250 }
4251 return result;
4252 failed1:
4253 Py_XDECREF(key);
4254 Py_XDECREF(value);
4255 Py_DECREF(result);
4256 return NULL;
4257 }
4258
4259 /* Create a three-level trie */
4260 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4261 16*count2 + 128*count3 - 1);
4262 if (!result)
4263 return PyErr_NoMemory();
4264 PyObject_Init(result, &EncodingMapType);
4265 mresult = (struct encoding_map*)result;
4266 mresult->count2 = count2;
4267 mresult->count3 = count3;
4268 mlevel1 = mresult->level1;
4269 mlevel2 = mresult->level23;
4270 mlevel3 = mresult->level23 + 16*count2;
4271 memcpy(mlevel1, level1, 32);
4272 memset(mlevel2, 0xFF, 16*count2);
4273 memset(mlevel3, 0, 128*count3);
4274 count3 = 0;
4275 for (i = 1; i < 256; i++) {
4276 int o1, o2, o3, i2, i3;
4277 if (decode[i] == 0xFFFE)
4278 /* unmapped character */
4279 continue;
4280 o1 = decode[i]>>11;
4281 o2 = (decode[i]>>7) & 0xF;
4282 i2 = 16*mlevel1[o1] + o2;
4283 if (mlevel2[i2] == 0xFF)
4284 mlevel2[i2] = count3++;
4285 o3 = decode[i] & 0x7F;
4286 i3 = 128*mlevel2[i2] + o3;
4287 mlevel3[i3] = i;
4288 }
4289 return result;
4290}
4291
4292static int
4293encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4294{
4295 struct encoding_map *map = (struct encoding_map*)mapping;
4296 int l1 = c>>11;
4297 int l2 = (c>>7) & 0xF;
4298 int l3 = c & 0x7F;
4299 int i;
4300
4301#ifdef Py_UNICODE_WIDE
4302 if (c > 0xFFFF) {
4303 return -1;
4304 }
4305#endif
4306 if (c == 0)
4307 return 0;
4308 /* level 1*/
4309 i = map->level1[l1];
4310 if (i == 0xFF) {
4311 return -1;
4312 }
4313 /* level 2*/
4314 i = map->level23[16*i+l2];
4315 if (i == 0xFF) {
4316 return -1;
4317 }
4318 /* level 3 */
4319 i = map->level23[16*map->count2 + 128*i + l3];
4320 if (i == 0) {
4321 return -1;
4322 }
4323 return i;
4324}
4325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326/* Lookup the character ch in the mapping. If the character
4327 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004328 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 PyObject *w = PyInt_FromLong((long)c);
4332 PyObject *x;
4333
4334 if (w == NULL)
4335 return NULL;
4336 x = PyObject_GetItem(mapping, w);
4337 Py_DECREF(w);
4338 if (x == NULL) {
4339 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4340 /* No mapping found means: mapping is undefined. */
4341 PyErr_Clear();
4342 x = Py_None;
4343 Py_INCREF(x);
4344 return x;
4345 } else
4346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004348 else if (x == Py_None)
4349 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004350 else if (PyInt_Check(x)) {
4351 long value = PyInt_AS_LONG(x);
4352 if (value < 0 || value > 255) {
4353 PyErr_SetString(PyExc_TypeError,
4354 "character mapping must be in range(256)");
4355 Py_DECREF(x);
4356 return NULL;
4357 }
4358 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004360 else if (PyString_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004361 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 /* wrong return value */
4364 PyErr_SetString(PyExc_TypeError,
4365 "character mapping must return integer, None or str");
4366 Py_DECREF(x);
4367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368 }
4369}
4370
Martin v. Löwis3f767792006-06-04 19:36:28 +00004371static int
4372charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4373{
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004374 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004375 /* exponentially overallocate to minimize reallocations */
4376 if (requiredsize < 2*outsize)
4377 requiredsize = 2*outsize;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004378 if (_PyString_Resize(outobj, requiredsize)) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004379 return 0;
4380 }
4381 return 1;
4382}
4383
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the replacement was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* a lookup or resize call reported an error */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387/* lookup the character, put the result in the output string and adjust
4388 various state variables. Reallocate the output string if not enough
4389 space is available. Return a new reference to the object that
4390 was put in the output buffer, or Py_None, if the mapping was undefined
4391 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004392 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004394charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004395 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004397 PyObject *rep;
4398 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004399 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400
Christian Heimese93237d2007-12-19 02:37:44 +00004401 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004402 int res = encoding_map_lookup(c, mapping);
4403 Py_ssize_t requiredsize = *outpos+1;
4404 if (res == -1)
4405 return enc_FAILED;
4406 if (outsize<requiredsize)
4407 if (!charmapencode_resize(outobj, outpos, requiredsize))
4408 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004409 outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004410 outstart[(*outpos)++] = (char)res;
4411 return enc_SUCCESS;
4412 }
4413
4414 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004416 return enc_EXCEPTION;
4417 else if (rep==Py_None) {
4418 Py_DECREF(rep);
4419 return enc_FAILED;
4420 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004422 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004423 if (outsize<requiredsize)
4424 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004426 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004428 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4430 }
4431 else {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004432 const char *repchars = PyString_AS_STRING(rep);
4433 Py_ssize_t repsize = PyString_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004434 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004435 if (outsize<requiredsize)
4436 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004438 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004440 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 memcpy(outstart + *outpos, repchars, repsize);
4442 *outpos += repsize;
4443 }
4444 }
Georg Brandl9f167602006-06-04 21:46:16 +00004445 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004446 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447}
4448
4449/* handle an error in PyUnicode_EncodeCharmap
4450 Return 0 on success, -1 on error */
4451static
4452int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004453 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004455 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004456 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457{
4458 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004459 Py_ssize_t repsize;
4460 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 Py_UNICODE *uni2;
4462 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004463 Py_ssize_t collstartpos = *inpos;
4464 Py_ssize_t collendpos = *inpos+1;
4465 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004466 char *encoding = "charmap";
4467 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004468 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 /* find all unencodable characters */
4471 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004472 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004473 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004474 int res = encoding_map_lookup(p[collendpos], mapping);
4475 if (res != -1)
4476 break;
4477 ++collendpos;
4478 continue;
4479 }
4480
4481 rep = charmapencode_lookup(p[collendpos], mapping);
4482 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004484 else if (rep!=Py_None) {
4485 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 break;
4487 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004488 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 ++collendpos;
4490 }
4491 /* cache callback name lookup
4492 * (if not done yet, i.e. it's the first error) */
4493 if (*known_errorHandler==-1) {
4494 if ((errors==NULL) || (!strcmp(errors, "strict")))
4495 *known_errorHandler = 1;
4496 else if (!strcmp(errors, "replace"))
4497 *known_errorHandler = 2;
4498 else if (!strcmp(errors, "ignore"))
4499 *known_errorHandler = 3;
4500 else if (!strcmp(errors, "xmlcharrefreplace"))
4501 *known_errorHandler = 4;
4502 else
4503 *known_errorHandler = 0;
4504 }
4505 switch (*known_errorHandler) {
4506 case 1: /* strict */
4507 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4508 return -1;
4509 case 2: /* replace */
4510 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4511 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004512 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 return -1;
4514 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004515 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4517 return -1;
4518 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 }
4520 /* fall through */
4521 case 3: /* ignore */
4522 *inpos = collendpos;
4523 break;
4524 case 4: /* xmlcharrefreplace */
4525 /* generate replacement (temporarily (mis)uses p) */
4526 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4527 char buffer[2+29+1+1];
4528 char *cp;
4529 sprintf(buffer, "&#%d;", (int)p[collpos]);
4530 for (cp = buffer; *cp; ++cp) {
4531 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004532 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004534 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4536 return -1;
4537 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 }
4539 }
4540 *inpos = collendpos;
4541 break;
4542 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004543 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 encoding, reason, p, size, exceptionObject,
4545 collstartpos, collendpos, &newpos);
4546 if (repunicode == NULL)
4547 return -1;
4548 /* generate replacement */
4549 repsize = PyUnicode_GET_SIZE(repunicode);
4550 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4551 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004552 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 return -1;
4554 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004555 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4558 return -1;
4559 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560 }
4561 *inpos = newpos;
4562 Py_DECREF(repunicode);
4563 }
4564 return 0;
4565}
4566
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004568 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569 PyObject *mapping,
4570 const char *errors)
4571{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 /* output object */
4573 PyObject *res = NULL;
4574 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004575 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004577 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 PyObject *errorHandler = NULL;
4579 PyObject *exc = NULL;
4580 /* the following variable is used for caching string comparisons
4581 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4582 * 3=ignore, 4=xmlcharrefreplace */
4583 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584
4585 /* Default to Latin-1 */
4586 if (mapping == NULL)
4587 return PyUnicode_EncodeLatin1(p, size, errors);
4588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 /* allocate enough for a simple encoding without
4590 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004591 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 if (res == NULL)
4593 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004594 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 while (inpos<size) {
4598 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004599 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4600 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004602 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 if (charmap_encoding_error(p, size, &inpos, mapping,
4604 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004605 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004606 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004607 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 else
4611 /* done with this character => adjust input position */
4612 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004615 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004616 if (respos<PyString_GET_SIZE(res)) {
4617 if (_PyString_Resize(&res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 goto onError;
4619 }
4620 Py_XDECREF(exc);
4621 Py_XDECREF(errorHandler);
4622 return res;
4623
4624 onError:
4625 Py_XDECREF(res);
4626 Py_XDECREF(exc);
4627 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628 return NULL;
4629}
4630
4631PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4632 PyObject *mapping)
4633{
4634 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4635 PyErr_BadArgument();
4636 return NULL;
4637 }
4638 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4639 PyUnicode_GET_SIZE(unicode),
4640 mapping,
4641 NULL);
4642}
4643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644/* create or adjust a UnicodeTranslateError */
4645static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004646 const Py_UNICODE *unicode, Py_ssize_t size,
4647 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 if (*exceptionObject == NULL) {
4651 *exceptionObject = PyUnicodeTranslateError_Create(
4652 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 }
4654 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4656 goto onError;
4657 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4658 goto onError;
4659 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4660 goto onError;
4661 return;
4662 onError:
4663 Py_DECREF(*exceptionObject);
4664 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665 }
4666}
4667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668/* raises a UnicodeTranslateError */
4669static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004670 const Py_UNICODE *unicode, Py_ssize_t size,
4671 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672 const char *reason)
4673{
4674 make_translate_exception(exceptionObject,
4675 unicode, size, startpos, endpos, reason);
4676 if (*exceptionObject != NULL)
4677 PyCodec_StrictErrors(*exceptionObject);
4678}
4679
4680/* error handling callback helper:
4681 build arguments, call the callback and check the arguments,
4682 put the result into newpos and return the replacement string, which
4683 has to be freed by the caller */
4684static PyObject *unicode_translate_call_errorhandler(const char *errors,
4685 PyObject **errorHandler,
4686 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004687 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4688 Py_ssize_t startpos, Py_ssize_t endpos,
4689 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004691 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692
Martin v. Löwis412fb672006-04-13 06:34:32 +00004693 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004694 PyObject *restuple;
4695 PyObject *resunicode;
4696
4697 if (*errorHandler == NULL) {
4698 *errorHandler = PyCodec_LookupError(errors);
4699 if (*errorHandler == NULL)
4700 return NULL;
4701 }
4702
4703 make_translate_exception(exceptionObject,
4704 unicode, size, startpos, endpos, reason);
4705 if (*exceptionObject == NULL)
4706 return NULL;
4707
4708 restuple = PyObject_CallFunctionObjArgs(
4709 *errorHandler, *exceptionObject, NULL);
4710 if (restuple == NULL)
4711 return NULL;
4712 if (!PyTuple_Check(restuple)) {
4713 PyErr_Format(PyExc_TypeError, &argparse[4]);
4714 Py_DECREF(restuple);
4715 return NULL;
4716 }
4717 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004718 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 Py_DECREF(restuple);
4720 return NULL;
4721 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004722 if (i_newpos<0)
4723 *newpos = size+i_newpos;
4724 else
4725 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004726 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004727 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004728 Py_DECREF(restuple);
4729 return NULL;
4730 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004731 Py_INCREF(resunicode);
4732 Py_DECREF(restuple);
4733 return resunicode;
4734}
4735
4736/* Lookup the character ch in the mapping and put the result in result,
4737 which must be decrefed by the caller.
4738 Return 0 on success, -1 on error */
4739static
4740int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4741{
4742 PyObject *w = PyInt_FromLong((long)c);
4743 PyObject *x;
4744
4745 if (w == NULL)
4746 return -1;
4747 x = PyObject_GetItem(mapping, w);
4748 Py_DECREF(w);
4749 if (x == NULL) {
4750 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4751 /* No mapping found means: use 1:1 mapping. */
4752 PyErr_Clear();
4753 *result = NULL;
4754 return 0;
4755 } else
4756 return -1;
4757 }
4758 else if (x == Py_None) {
4759 *result = x;
4760 return 0;
4761 }
4762 else if (PyInt_Check(x)) {
4763 long value = PyInt_AS_LONG(x);
4764 long max = PyUnicode_GetMax();
4765 if (value < 0 || value > max) {
4766 PyErr_Format(PyExc_TypeError,
4767 "character mapping must be in range(0x%lx)", max+1);
4768 Py_DECREF(x);
4769 return -1;
4770 }
4771 *result = x;
4772 return 0;
4773 }
4774 else if (PyUnicode_Check(x)) {
4775 *result = x;
4776 return 0;
4777 }
4778 else {
4779 /* wrong return value */
4780 PyErr_SetString(PyExc_TypeError,
4781 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004782 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004783 return -1;
4784 }
4785}
4786/* ensure that *outobj is at least requiredsize characters long,
4787if not reallocate and adjust various state variables.
4788Return 0 on success, -1 on error */
4789static
Walter Dörwald4894c302003-10-24 14:25:28 +00004790int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004791 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004793 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004794 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004796 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004797 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004798 if (requiredsize < 2 * oldsize)
4799 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004800 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 return -1;
4802 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 }
4804 return 0;
4805}
4806/* lookup the character, put the result in the output string and adjust
4807 various state variables. Return a new reference to the object that
4808 was put in the output buffer in *result, or Py_None, if the mapping was
4809 undefined (in which case no character was written).
4810 The called must decref result.
4811 Return 0 on success, -1 on error. */
4812static
Walter Dörwald4894c302003-10-24 14:25:28 +00004813int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004814 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004815 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816{
Walter Dörwald4894c302003-10-24 14:25:28 +00004817 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 return -1;
4819 if (*res==NULL) {
4820 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004821 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 }
4823 else if (*res==Py_None)
4824 ;
4825 else if (PyInt_Check(*res)) {
4826 /* no overflow check, because we know that the space is enough */
4827 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4828 }
4829 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004830 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 if (repsize==1) {
4832 /* no overflow check, because we know that the space is enough */
4833 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4834 }
4835 else if (repsize!=0) {
4836 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004837 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004838 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004839 repsize - 1;
4840 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841 return -1;
4842 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4843 *outp += repsize;
4844 }
4845 }
4846 else
4847 return -1;
4848 return 0;
4849}
4850
4851PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 PyObject *mapping,
4854 const char *errors)
4855{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 /* output object */
4857 PyObject *res = NULL;
4858 /* pointers to the beginning and end+1 of input */
4859 const Py_UNICODE *startp = p;
4860 const Py_UNICODE *endp = p + size;
4861 /* pointer into the output */
4862 Py_UNICODE *str;
4863 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004864 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 char *reason = "character maps to <undefined>";
4866 PyObject *errorHandler = NULL;
4867 PyObject *exc = NULL;
4868 /* the following variable is used for caching string comparisons
4869 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4870 * 3=ignore, 4=xmlcharrefreplace */
4871 int known_errorHandler = -1;
4872
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 if (mapping == NULL) {
4874 PyErr_BadArgument();
4875 return NULL;
4876 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877
4878 /* allocate enough for a simple 1:1 translation without
4879 replacements, if we need more, we'll resize */
4880 res = PyUnicode_FromUnicode(NULL, size);
4881 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004882 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004884 return res;
4885 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 while (p<endp) {
4888 /* try to encode it */
4889 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004890 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 goto onError;
4893 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004894 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895 if (x!=Py_None) /* it worked => adjust input pointer */
4896 ++p;
4897 else { /* untranslatable character */
4898 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004899 Py_ssize_t repsize;
4900 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901 Py_UNICODE *uni2;
4902 /* startpos for collecting untranslatable chars */
4903 const Py_UNICODE *collstart = p;
4904 const Py_UNICODE *collend = p+1;
4905 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 /* find all untranslatable characters */
4908 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004909 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 goto onError;
4911 Py_XDECREF(x);
4912 if (x!=Py_None)
4913 break;
4914 ++collend;
4915 }
4916 /* cache callback name lookup
4917 * (if not done yet, i.e. it's the first error) */
4918 if (known_errorHandler==-1) {
4919 if ((errors==NULL) || (!strcmp(errors, "strict")))
4920 known_errorHandler = 1;
4921 else if (!strcmp(errors, "replace"))
4922 known_errorHandler = 2;
4923 else if (!strcmp(errors, "ignore"))
4924 known_errorHandler = 3;
4925 else if (!strcmp(errors, "xmlcharrefreplace"))
4926 known_errorHandler = 4;
4927 else
4928 known_errorHandler = 0;
4929 }
4930 switch (known_errorHandler) {
4931 case 1: /* strict */
4932 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4933 goto onError;
4934 case 2: /* replace */
4935 /* No need to check for space, this is a 1:1 replacement */
4936 for (coll = collstart; coll<collend; ++coll)
4937 *str++ = '?';
4938 /* fall through */
4939 case 3: /* ignore */
4940 p = collend;
4941 break;
4942 case 4: /* xmlcharrefreplace */
4943 /* generate replacement (temporarily (mis)uses p) */
4944 for (p = collstart; p < collend; ++p) {
4945 char buffer[2+29+1+1];
4946 char *cp;
4947 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004948 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4950 goto onError;
4951 for (cp = buffer; *cp; ++cp)
4952 *str++ = *cp;
4953 }
4954 p = collend;
4955 break;
4956 default:
4957 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4958 reason, startp, size, &exc,
4959 collstart-startp, collend-startp, &newpos);
4960 if (repunicode == NULL)
4961 goto onError;
4962 /* generate replacement */
4963 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004964 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004965 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4966 Py_DECREF(repunicode);
4967 goto onError;
4968 }
4969 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4970 *str++ = *uni2;
4971 p = startp + newpos;
4972 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 }
4974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 /* Resize if we allocated to much */
4977 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004978 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004979 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004980 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981 }
4982 Py_XDECREF(exc);
4983 Py_XDECREF(errorHandler);
4984 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 onError:
4987 Py_XDECREF(res);
4988 Py_XDECREF(exc);
4989 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 return NULL;
4991}
4992
4993PyObject *PyUnicode_Translate(PyObject *str,
4994 PyObject *mapping,
4995 const char *errors)
4996{
4997 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004998
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999 str = PyUnicode_FromObject(str);
5000 if (str == NULL)
5001 goto onError;
5002 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5003 PyUnicode_GET_SIZE(str),
5004 mapping,
5005 errors);
5006 Py_DECREF(str);
5007 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005008
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009 onError:
5010 Py_XDECREF(str);
5011 return NULL;
5012}
Tim Petersced69f82003-09-16 20:30:58 +00005013
Guido van Rossum9e896b32000-04-05 20:11:21 +00005014/* --- Decimal Encoder ---------------------------------------------------- */
5015
5016int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005017 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005018 char *output,
5019 const char *errors)
5020{
5021 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 PyObject *errorHandler = NULL;
5023 PyObject *exc = NULL;
5024 const char *encoding = "decimal";
5025 const char *reason = "invalid decimal Unicode string";
5026 /* the following variable is used for caching string comparisons
5027 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5028 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005029
5030 if (output == NULL) {
5031 PyErr_BadArgument();
5032 return -1;
5033 }
5034
5035 p = s;
5036 end = s + length;
5037 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005038 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005039 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005041 Py_ssize_t repsize;
5042 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043 Py_UNICODE *uni2;
5044 Py_UNICODE *collstart;
5045 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005046
Guido van Rossum9e896b32000-04-05 20:11:21 +00005047 if (Py_UNICODE_ISSPACE(ch)) {
5048 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005049 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005050 continue;
5051 }
5052 decimal = Py_UNICODE_TODECIMAL(ch);
5053 if (decimal >= 0) {
5054 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005055 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005056 continue;
5057 }
Guido van Rossumba477042000-04-06 18:18:10 +00005058 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005059 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005061 continue;
5062 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005063 /* All other characters are considered unencodable */
5064 collstart = p;
5065 collend = p+1;
5066 while (collend < end) {
5067 if ((0 < *collend && *collend < 256) ||
5068 !Py_UNICODE_ISSPACE(*collend) ||
5069 Py_UNICODE_TODECIMAL(*collend))
5070 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005071 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005072 /* cache callback name lookup
5073 * (if not done yet, i.e. it's the first error) */
5074 if (known_errorHandler==-1) {
5075 if ((errors==NULL) || (!strcmp(errors, "strict")))
5076 known_errorHandler = 1;
5077 else if (!strcmp(errors, "replace"))
5078 known_errorHandler = 2;
5079 else if (!strcmp(errors, "ignore"))
5080 known_errorHandler = 3;
5081 else if (!strcmp(errors, "xmlcharrefreplace"))
5082 known_errorHandler = 4;
5083 else
5084 known_errorHandler = 0;
5085 }
5086 switch (known_errorHandler) {
5087 case 1: /* strict */
5088 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5089 goto onError;
5090 case 2: /* replace */
5091 for (p = collstart; p < collend; ++p)
5092 *output++ = '?';
5093 /* fall through */
5094 case 3: /* ignore */
5095 p = collend;
5096 break;
5097 case 4: /* xmlcharrefreplace */
5098 /* generate replacement (temporarily (mis)uses p) */
5099 for (p = collstart; p < collend; ++p)
5100 output += sprintf(output, "&#%d;", (int)*p);
5101 p = collend;
5102 break;
5103 default:
5104 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5105 encoding, reason, s, length, &exc,
5106 collstart-s, collend-s, &newpos);
5107 if (repunicode == NULL)
5108 goto onError;
5109 /* generate replacement */
5110 repsize = PyUnicode_GET_SIZE(repunicode);
5111 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5112 Py_UNICODE ch = *uni2;
5113 if (Py_UNICODE_ISSPACE(ch))
5114 *output++ = ' ';
5115 else {
5116 decimal = Py_UNICODE_TODECIMAL(ch);
5117 if (decimal >= 0)
5118 *output++ = '0' + decimal;
5119 else if (0 < ch && ch < 256)
5120 *output++ = (char)ch;
5121 else {
5122 Py_DECREF(repunicode);
5123 raise_encode_exception(&exc, encoding,
5124 s, length, collstart-s, collend-s, reason);
5125 goto onError;
5126 }
5127 }
5128 }
5129 p = s + newpos;
5130 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005131 }
5132 }
5133 /* 0-terminate the output string */
5134 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005135 Py_XDECREF(exc);
5136 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005137 return 0;
5138
5139 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 Py_XDECREF(exc);
5141 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005142 return -1;
5143}
5144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145/* --- Helpers ------------------------------------------------------------ */
5146
Eric Smitha9f7d622008-02-17 19:46:49 +00005147#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005148
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005149#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005150
Fredrik Lundha50d2012006-05-26 17:04:58 +00005151#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005152
5153#include "stringlib/count.h"
5154#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005155#include "stringlib/partition.h"
5156
/* helper macro to fixup start/end slice values

   Operates on the variables named "start" and "end" that are in scope
   where the macro is expanded, using (obj)->length as the length:
   a negative start or end is taken relative to the length and then
   clamped to 0, and an end beyond the length is clamped to the length.
   start is not clamped to the length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5169
Martin v. Löwis18e16552006-02-15 17:27:45 +00005170Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005171 PyObject *substr,
5172 Py_ssize_t start,
5173 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005175 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005176 PyUnicodeObject* str_obj;
5177 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005178
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005179 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5180 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005182 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5183 if (!sub_obj) {
5184 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 return -1;
5186 }
Tim Petersced69f82003-09-16 20:30:58 +00005187
Fredrik Lundhc8162812006-05-26 19:33:03 +00005188 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005189
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005190 result = stringlib_count(
5191 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5192 );
5193
5194 Py_DECREF(sub_obj);
5195 Py_DECREF(str_obj);
5196
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 return result;
5198}
5199
Martin v. Löwis18e16552006-02-15 17:27:45 +00005200Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005201 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005202 Py_ssize_t start,
5203 Py_ssize_t end,
5204 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005207
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005208 str = PyUnicode_FromObject(str);
5209 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005210 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005211 sub = PyUnicode_FromObject(sub);
5212 if (!sub) {
5213 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005214 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 }
Tim Petersced69f82003-09-16 20:30:58 +00005216
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005217 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005218 result = stringlib_find_slice(
5219 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5220 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5221 start, end
5222 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005223 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005224 result = stringlib_rfind_slice(
5225 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5226 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5227 start, end
5228 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005229
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005230 Py_DECREF(str);
5231 Py_DECREF(sub);
5232
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 return result;
5234}
5235
Tim Petersced69f82003-09-16 20:30:58 +00005236static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237int tailmatch(PyUnicodeObject *self,
5238 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005239 Py_ssize_t start,
5240 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 int direction)
5242{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 if (substring->length == 0)
5244 return 1;
5245
Fredrik Lundhc8162812006-05-26 19:33:03 +00005246 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247
5248 end -= substring->length;
5249 if (end < start)
5250 return 0;
5251
5252 if (direction > 0) {
5253 if (Py_UNICODE_MATCH(self, end, substring))
5254 return 1;
5255 } else {
5256 if (Py_UNICODE_MATCH(self, start, substring))
5257 return 1;
5258 }
5259
5260 return 0;
5261}
5262
Martin v. Löwis18e16552006-02-15 17:27:45 +00005263Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005265 Py_ssize_t start,
5266 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 int direction)
5268{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005269 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 str = PyUnicode_FromObject(str);
5272 if (str == NULL)
5273 return -1;
5274 substr = PyUnicode_FromObject(substr);
5275 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005276 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 return -1;
5278 }
Tim Petersced69f82003-09-16 20:30:58 +00005279
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 result = tailmatch((PyUnicodeObject *)str,
5281 (PyUnicodeObject *)substr,
5282 start, end, direction);
5283 Py_DECREF(str);
5284 Py_DECREF(substr);
5285 return result;
5286}
5287
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288/* Apply fixfct filter to the Unicode object self and return a
5289 reference to the modified object */
5290
Tim Petersced69f82003-09-16 20:30:58 +00005291static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292PyObject *fixup(PyUnicodeObject *self,
5293 int (*fixfct)(PyUnicodeObject *s))
5294{
5295
5296 PyUnicodeObject *u;
5297
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005298 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 if (u == NULL)
5300 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005301
5302 Py_UNICODE_COPY(u->str, self->str, self->length);
5303
Tim Peters7a29bd52001-09-12 03:03:31 +00005304 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 /* fixfct should return TRUE if it modified the buffer. If
5306 FALSE, return a reference to the original buffer instead
5307 (to save space, not time) */
5308 Py_INCREF(self);
5309 Py_DECREF(u);
5310 return (PyObject*) self;
5311 }
5312 return (PyObject*) u;
5313}
5314
Tim Petersced69f82003-09-16 20:30:58 +00005315static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316int fixupper(PyUnicodeObject *self)
5317{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005318 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 Py_UNICODE *s = self->str;
5320 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005321
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 while (len-- > 0) {
5323 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005324
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 ch = Py_UNICODE_TOUPPER(*s);
5326 if (ch != *s) {
5327 status = 1;
5328 *s = ch;
5329 }
5330 s++;
5331 }
5332
5333 return status;
5334}
5335
Tim Petersced69f82003-09-16 20:30:58 +00005336static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337int fixlower(PyUnicodeObject *self)
5338{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005339 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 Py_UNICODE *s = self->str;
5341 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005342
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 while (len-- > 0) {
5344 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005345
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 ch = Py_UNICODE_TOLOWER(*s);
5347 if (ch != *s) {
5348 status = 1;
5349 *s = ch;
5350 }
5351 s++;
5352 }
5353
5354 return status;
5355}
5356
Tim Petersced69f82003-09-16 20:30:58 +00005357static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358int fixswapcase(PyUnicodeObject *self)
5359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005360 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 Py_UNICODE *s = self->str;
5362 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005363
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 while (len-- > 0) {
5365 if (Py_UNICODE_ISUPPER(*s)) {
5366 *s = Py_UNICODE_TOLOWER(*s);
5367 status = 1;
5368 } else if (Py_UNICODE_ISLOWER(*s)) {
5369 *s = Py_UNICODE_TOUPPER(*s);
5370 status = 1;
5371 }
5372 s++;
5373 }
5374
5375 return status;
5376}
5377
Tim Petersced69f82003-09-16 20:30:58 +00005378static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379int fixcapitalize(PyUnicodeObject *self)
5380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005381 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005382 Py_UNICODE *s = self->str;
5383 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005384
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005385 if (len == 0)
5386 return 0;
5387 if (Py_UNICODE_ISLOWER(*s)) {
5388 *s = Py_UNICODE_TOUPPER(*s);
5389 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005391 s++;
5392 while (--len > 0) {
5393 if (Py_UNICODE_ISUPPER(*s)) {
5394 *s = Py_UNICODE_TOLOWER(*s);
5395 status = 1;
5396 }
5397 s++;
5398 }
5399 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400}
5401
5402static
5403int fixtitle(PyUnicodeObject *self)
5404{
5405 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5406 register Py_UNICODE *e;
5407 int previous_is_cased;
5408
5409 /* Shortcut for single character strings */
5410 if (PyUnicode_GET_SIZE(self) == 1) {
5411 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5412 if (*p != ch) {
5413 *p = ch;
5414 return 1;
5415 }
5416 else
5417 return 0;
5418 }
Tim Petersced69f82003-09-16 20:30:58 +00005419
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 e = p + PyUnicode_GET_SIZE(self);
5421 previous_is_cased = 0;
5422 for (; p < e; p++) {
5423 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005424
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 if (previous_is_cased)
5426 *p = Py_UNICODE_TOLOWER(ch);
5427 else
5428 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005429
5430 if (Py_UNICODE_ISLOWER(ch) ||
5431 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 Py_UNICODE_ISTITLE(ch))
5433 previous_is_cased = 1;
5434 else
5435 previous_is_cased = 0;
5436 }
5437 return 1;
5438}
5439
Tim Peters8ce9f162004-08-27 01:49:32 +00005440PyObject *
5441PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442{
Tim Peters8ce9f162004-08-27 01:49:32 +00005443 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005444 const Py_UNICODE blank = ' ';
5445 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005446 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005447 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005448 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5449 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005450 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5451 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005452 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005453 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005454 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455
Tim Peters05eba1f2004-08-27 21:32:02 +00005456 fseq = PySequence_Fast(seq, "");
5457 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005458 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005459 }
5460
Tim Peters91879ab2004-08-27 22:35:44 +00005461 /* Grrrr. A codec may be invoked to convert str objects to
5462 * Unicode, and so it's possible to call back into Python code
5463 * during PyUnicode_FromObject(), and so it's possible for a sick
5464 * codec to change the size of fseq (if seq is a list). Therefore
5465 * we have to keep refetching the size -- can't assume seqlen
5466 * is invariant.
5467 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005468 seqlen = PySequence_Fast_GET_SIZE(fseq);
5469 /* If empty sequence, return u"". */
5470 if (seqlen == 0) {
5471 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5472 goto Done;
5473 }
5474 /* If singleton sequence with an exact Unicode, return that. */
5475 if (seqlen == 1) {
5476 item = PySequence_Fast_GET_ITEM(fseq, 0);
5477 if (PyUnicode_CheckExact(item)) {
5478 Py_INCREF(item);
5479 res = (PyUnicodeObject *)item;
5480 goto Done;
5481 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005482 }
5483
Tim Peters05eba1f2004-08-27 21:32:02 +00005484 /* At least two items to join, or one that isn't exact Unicode. */
5485 if (seqlen > 1) {
5486 /* Set up sep and seplen -- they're needed. */
5487 if (separator == NULL) {
5488 sep = &blank;
5489 seplen = 1;
5490 }
5491 else {
5492 internal_separator = PyUnicode_FromObject(separator);
5493 if (internal_separator == NULL)
5494 goto onError;
5495 sep = PyUnicode_AS_UNICODE(internal_separator);
5496 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005497 /* In case PyUnicode_FromObject() mutated seq. */
5498 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005499 }
5500 }
5501
5502 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005503 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005504 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005505 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005506 res_p = PyUnicode_AS_UNICODE(res);
5507 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005508
Tim Peters05eba1f2004-08-27 21:32:02 +00005509 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005510 Py_ssize_t itemlen;
5511 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005512
5513 item = PySequence_Fast_GET_ITEM(fseq, i);
5514 /* Convert item to Unicode. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00005515 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005516 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005517 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005518 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005519 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005520 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005521 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005522 item = PyUnicode_FromObject(item);
5523 if (item == NULL)
5524 goto onError;
5525 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005526
Tim Peters91879ab2004-08-27 22:35:44 +00005527 /* In case PyUnicode_FromObject() mutated seq. */
5528 seqlen = PySequence_Fast_GET_SIZE(fseq);
5529
Tim Peters8ce9f162004-08-27 01:49:32 +00005530 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005532 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005533 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005534 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005535 if (i < seqlen - 1) {
5536 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005537 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005538 goto Overflow;
5539 }
5540 if (new_res_used > res_alloc) {
5541 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005542 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005543 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005544 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005545 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005546 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005547 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005548 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005550 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005551 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005553
5554 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005555 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005556 res_p += itemlen;
5557 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005558 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005559 res_p += seplen;
5560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005562 res_used = new_res_used;
5563 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005564
Tim Peters05eba1f2004-08-27 21:32:02 +00005565 /* Shrink res to match the used area; this probably can't fail,
5566 * but it's cheap to check.
5567 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005568 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005569 goto onError;
5570
5571 Done:
5572 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005573 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 return (PyObject *)res;
5575
Tim Peters8ce9f162004-08-27 01:49:32 +00005576 Overflow:
5577 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005578 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005579 Py_DECREF(item);
5580 /* fall through */
5581
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005583 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005584 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005585 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 return NULL;
5587}
5588
Tim Petersced69f82003-09-16 20:30:58 +00005589static
5590PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005591 Py_ssize_t left,
5592 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 Py_UNICODE fill)
5594{
5595 PyUnicodeObject *u;
5596
5597 if (left < 0)
5598 left = 0;
5599 if (right < 0)
5600 right = 0;
5601
Tim Peters7a29bd52001-09-12 03:03:31 +00005602 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 Py_INCREF(self);
5604 return self;
5605 }
5606
Neal Norwitze7d8be82008-07-31 17:17:14 +00005607 if (left > PY_SSIZE_T_MAX - self->length ||
5608 right > PY_SSIZE_T_MAX - (left + self->length)) {
5609 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5610 return NULL;
5611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 u = _PyUnicode_New(left + self->length + right);
5613 if (u) {
5614 if (left)
5615 Py_UNICODE_FILL(u->str, fill, left);
5616 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5617 if (right)
5618 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5619 }
5620
5621 return u;
5622}
5623
/* Append the slice data[left:right] to `list` as a new unicode object.
 *
 * The expansion relies on three names from the calling function: a
 * `PyObject *str` scratch variable, the target `list`, and an `onError`
 * label that is jumped to when either the slice cannot be created or the
 * append fails.  The temporary reference held in `str` is always released
 * once the append has been attempted.
 *
 * Wrapped in do { } while (0) so that an invocation followed by ';' is
 * exactly one statement and can safely be used as the body of an
 * un-braced if/else.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_rc_;                                           \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_rc_ = PyList_Append(list, str);                    \
        Py_DECREF(str);                                                 \
        if (split_append_rc_)                                           \
            goto onError;                                               \
    } while (0)
5634
5635static
5636PyObject *split_whitespace(PyUnicodeObject *self,
5637 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005638 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 register Py_ssize_t i;
5641 register Py_ssize_t j;
5642 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005644 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645
5646 for (i = j = 0; i < len; ) {
5647 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005648 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 i++;
5650 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005651 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 i++;
5653 if (j < i) {
5654 if (maxcount-- <= 0)
5655 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005656 SPLIT_APPEND(buf, j, i);
5657 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 i++;
5659 j = i;
5660 }
5661 }
5662 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005663 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 }
5665 return list;
5666
5667 onError:
5668 Py_DECREF(list);
5669 return NULL;
5670}
5671
5672PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005673 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005675 register Py_ssize_t i;
5676 register Py_ssize_t j;
5677 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 PyObject *list;
5679 PyObject *str;
5680 Py_UNICODE *data;
5681
5682 string = PyUnicode_FromObject(string);
5683 if (string == NULL)
5684 return NULL;
5685 data = PyUnicode_AS_UNICODE(string);
5686 len = PyUnicode_GET_SIZE(string);
5687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 list = PyList_New(0);
5689 if (!list)
5690 goto onError;
5691
5692 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005693 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005694
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005696 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698
5699 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005700 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 if (i < len) {
5702 if (data[i] == '\r' && i + 1 < len &&
5703 data[i+1] == '\n')
5704 i += 2;
5705 else
5706 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005707 if (keepends)
5708 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 }
Guido van Rossum86662912000-04-11 15:38:46 +00005710 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 j = i;
5712 }
5713 if (j < len) {
5714 SPLIT_APPEND(data, j, len);
5715 }
5716
5717 Py_DECREF(string);
5718 return list;
5719
5720 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005721 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 Py_DECREF(string);
5723 return NULL;
5724}
5725
Tim Petersced69f82003-09-16 20:30:58 +00005726static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727PyObject *split_char(PyUnicodeObject *self,
5728 PyObject *list,
5729 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005730 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005732 register Py_ssize_t i;
5733 register Py_ssize_t j;
5734 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005736 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737
5738 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005739 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 if (maxcount-- <= 0)
5741 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005742 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 i = j = i + 1;
5744 } else
5745 i++;
5746 }
5747 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005748 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 }
5750 return list;
5751
5752 onError:
5753 Py_DECREF(list);
5754 return NULL;
5755}
5756
Tim Petersced69f82003-09-16 20:30:58 +00005757static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758PyObject *split_substring(PyUnicodeObject *self,
5759 PyObject *list,
5760 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005761 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005763 register Py_ssize_t i;
5764 register Py_ssize_t j;
5765 Py_ssize_t len = self->length;
5766 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 PyObject *str;
5768
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005769 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 if (Py_UNICODE_MATCH(self, i, substring)) {
5771 if (maxcount-- <= 0)
5772 break;
5773 SPLIT_APPEND(self->str, j, i);
5774 i = j = i + sublen;
5775 } else
5776 i++;
5777 }
5778 if (j <= len) {
5779 SPLIT_APPEND(self->str, j, len);
5780 }
5781 return list;
5782
5783 onError:
5784 Py_DECREF(list);
5785 return NULL;
5786}
5787
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005788static
5789PyObject *rsplit_whitespace(PyUnicodeObject *self,
5790 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005791 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005792{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005793 register Py_ssize_t i;
5794 register Py_ssize_t j;
5795 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005796 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005797 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005798
5799 for (i = j = len - 1; i >= 0; ) {
5800 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005801 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005802 i--;
5803 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005804 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005805 i--;
5806 if (j > i) {
5807 if (maxcount-- <= 0)
5808 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005809 SPLIT_APPEND(buf, i + 1, j + 1);
5810 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005811 i--;
5812 j = i;
5813 }
5814 }
5815 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005816 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005817 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005818 if (PyList_Reverse(list) < 0)
5819 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005820 return list;
5821
5822 onError:
5823 Py_DECREF(list);
5824 return NULL;
5825}
5826
5827static
5828PyObject *rsplit_char(PyUnicodeObject *self,
5829 PyObject *list,
5830 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833 register Py_ssize_t i;
5834 register Py_ssize_t j;
5835 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005836 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005837 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005838
5839 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005840 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005841 if (maxcount-- <= 0)
5842 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005843 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005844 j = i = i - 1;
5845 } else
5846 i--;
5847 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005848 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005849 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005850 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005851 if (PyList_Reverse(list) < 0)
5852 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005853 return list;
5854
5855 onError:
5856 Py_DECREF(list);
5857 return NULL;
5858}
5859
5860static
5861PyObject *rsplit_substring(PyUnicodeObject *self,
5862 PyObject *list,
5863 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005865{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 register Py_ssize_t i;
5867 register Py_ssize_t j;
5868 Py_ssize_t len = self->length;
5869 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005870 PyObject *str;
5871
5872 for (i = len - sublen, j = len; i >= 0; ) {
5873 if (Py_UNICODE_MATCH(self, i, substring)) {
5874 if (maxcount-- <= 0)
5875 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005876 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005877 j = i;
5878 i -= sublen;
5879 } else
5880 i--;
5881 }
5882 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005883 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005884 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005885 if (PyList_Reverse(list) < 0)
5886 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005887 return list;
5888
5889 onError:
5890 Py_DECREF(list);
5891 return NULL;
5892}
5893
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894#undef SPLIT_APPEND
5895
5896static
5897PyObject *split(PyUnicodeObject *self,
5898 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005899 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900{
5901 PyObject *list;
5902
5903 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005904 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
5906 list = PyList_New(0);
5907 if (!list)
5908 return NULL;
5909
5910 if (substring == NULL)
5911 return split_whitespace(self,list,maxcount);
5912
5913 else if (substring->length == 1)
5914 return split_char(self,list,substring->str[0],maxcount);
5915
5916 else if (substring->length == 0) {
5917 Py_DECREF(list);
5918 PyErr_SetString(PyExc_ValueError, "empty separator");
5919 return NULL;
5920 }
5921 else
5922 return split_substring(self,list,substring,maxcount);
5923}
5924
Tim Petersced69f82003-09-16 20:30:58 +00005925static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005926PyObject *rsplit(PyUnicodeObject *self,
5927 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005928 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005929{
5930 PyObject *list;
5931
5932 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005933 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005934
5935 list = PyList_New(0);
5936 if (!list)
5937 return NULL;
5938
5939 if (substring == NULL)
5940 return rsplit_whitespace(self,list,maxcount);
5941
5942 else if (substring->length == 1)
5943 return rsplit_char(self,list,substring->str[0],maxcount);
5944
5945 else if (substring->length == 0) {
5946 Py_DECREF(list);
5947 PyErr_SetString(PyExc_ValueError, "empty separator");
5948 return NULL;
5949 }
5950 else
5951 return rsplit_substring(self,list,substring,maxcount);
5952}
5953
5954static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955PyObject *replace(PyUnicodeObject *self,
5956 PyUnicodeObject *str1,
5957 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005958 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959{
5960 PyUnicodeObject *u;
5961
5962 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005963 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964
Fredrik Lundh347ee272006-05-24 16:35:18 +00005965 if (str1->length == str2->length) {
5966 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005967 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005968 if (str1->length == 1) {
5969 /* replace characters */
5970 Py_UNICODE u1, u2;
5971 if (!findchar(self->str, self->length, str1->str[0]))
5972 goto nothing;
5973 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5974 if (!u)
5975 return NULL;
5976 Py_UNICODE_COPY(u->str, self->str, self->length);
5977 u1 = str1->str[0];
5978 u2 = str2->str[0];
5979 for (i = 0; i < u->length; i++)
5980 if (u->str[i] == u1) {
5981 if (--maxcount < 0)
5982 break;
5983 u->str[i] = u2;
5984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005986 i = fastsearch(
5987 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005989 if (i < 0)
5990 goto nothing;
5991 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5992 if (!u)
5993 return NULL;
5994 Py_UNICODE_COPY(u->str, self->str, self->length);
5995 while (i <= self->length - str1->length)
5996 if (Py_UNICODE_MATCH(self, i, str1)) {
5997 if (--maxcount < 0)
5998 break;
5999 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6000 i += str1->length;
6001 } else
6002 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006005
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006006 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006007 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 Py_UNICODE *p;
6009
6010 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006011 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 if (n > maxcount)
6013 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006014 if (n == 0)
6015 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006016 /* new_size = self->length + n * (str2->length - str1->length)); */
6017 delta = (str2->length - str1->length);
6018 if (delta == 0) {
6019 new_size = self->length;
6020 } else {
6021 product = n * (str2->length - str1->length);
6022 if ((product / (str2->length - str1->length)) != n) {
6023 PyErr_SetString(PyExc_OverflowError,
6024 "replace string is too long");
6025 return NULL;
6026 }
6027 new_size = self->length + product;
6028 if (new_size < 0) {
6029 PyErr_SetString(PyExc_OverflowError,
6030 "replace string is too long");
6031 return NULL;
6032 }
6033 }
6034 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006035 if (!u)
6036 return NULL;
6037 i = 0;
6038 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006039 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006040 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006041 while (n-- > 0) {
6042 /* look for next match */
6043 j = i;
6044 while (j <= e) {
6045 if (Py_UNICODE_MATCH(self, j, str1))
6046 break;
6047 j++;
6048 }
6049 if (j > i) {
6050 if (j > e)
6051 break;
6052 /* copy unchanged part [i:j] */
6053 Py_UNICODE_COPY(p, self->str+i, j-i);
6054 p += j - i;
6055 }
6056 /* copy substitution string */
6057 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006058 Py_UNICODE_COPY(p, str2->str, str2->length);
6059 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006060 }
6061 i = j + str1->length;
6062 }
6063 if (i < self->length)
6064 /* copy tail [i:] */
6065 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006066 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006067 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006068 while (n > 0) {
6069 Py_UNICODE_COPY(p, str2->str, str2->length);
6070 p += str2->length;
6071 if (--n <= 0)
6072 break;
6073 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006075 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 }
6077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006079
6080nothing:
6081 /* nothing to replace; return original string (when possible) */
6082 if (PyUnicode_CheckExact(self)) {
6083 Py_INCREF(self);
6084 return (PyObject *) self;
6085 }
6086 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087}
6088
6089/* --- Unicode Object Methods --------------------------------------------- */
6090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006091PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092"S.title() -> unicode\n\
6093\n\
6094Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006095characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096
6097static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006098unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 return fixup(self, fixtitle);
6101}
6102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006103PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104"S.capitalize() -> unicode\n\
6105\n\
6106Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006107have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108
6109static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006110unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 return fixup(self, fixcapitalize);
6113}
6114
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split on whitespace, capitalize every word in place in the
 * list, then join the words again via PyUnicode_Join(NULL, ...).
 * Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL)
            break;
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string (only if every word converted) */
    if (idx == PyList_GET_SIZE(words))
        result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return result;
}
#endif
6152
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006153/* Argument converter. Coerces to a single unicode character */
6154
6155static int
6156convert_uc(PyObject *obj, void *addr)
6157{
6158 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6159 PyObject *uniobj;
6160 Py_UNICODE *unistr;
6161
6162 uniobj = PyUnicode_FromObject(obj);
6163 if (uniobj == NULL) {
6164 PyErr_SetString(PyExc_TypeError,
6165 "The fill character cannot be converted to Unicode");
6166 return 0;
6167 }
6168 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6169 PyErr_SetString(PyExc_TypeError,
6170 "The fill character must be exactly one character long");
6171 Py_DECREF(uniobj);
6172 return 0;
6173 }
6174 unistr = PyUnicode_AS_UNICODE(uniobj);
6175 *fillcharloc = unistr[0];
6176 Py_DECREF(uniobj);
6177 return 1;
6178}
6179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006180PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006181"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006183Return S centered in a Unicode string of length width. Padding is\n\
6184done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
6186static PyObject *
6187unicode_center(PyUnicodeObject *self, PyObject *args)
6188{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006189 Py_ssize_t marg, left;
6190 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006191 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192
Thomas Woutersde017742006-02-16 19:34:37 +00006193 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 return NULL;
6195
Tim Peters7a29bd52001-09-12 03:03:31 +00006196 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 Py_INCREF(self);
6198 return (PyObject*) self;
6199 }
6200
6201 marg = width - self->length;
6202 left = marg / 2 + (marg & width & 1);
6203
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006204 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205}
6206
Marc-André Lemburge5034372000-08-08 08:04:29 +00006207#if 0
6208
6209/* This code should go into some future Unicode collation support
6210 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006211 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006212
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006213/* speedy UTF-16 code point order comparison */
6214/* gleaned from: */
6215/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6216
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006217static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006218{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006219 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006220 0, 0, 0, 0, 0, 0, 0, 0,
6221 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006222 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006223};
6224
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225static int
6226unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6227{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006228 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006229
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 Py_UNICODE *s1 = str1->str;
6231 Py_UNICODE *s2 = str2->str;
6232
6233 len1 = str1->length;
6234 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006235
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006237 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006238
6239 c1 = *s1++;
6240 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006241
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006242 if (c1 > (1<<11) * 26)
6243 c1 += utf16Fixup[c1>>11];
6244 if (c2 > (1<<11) * 26)
6245 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006246 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006247
6248 if (c1 != c2)
6249 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006250
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006251 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 }
6253
6254 return (len1 < len2) ? -1 : (len1 != len2);
6255}
6256
Marc-André Lemburge5034372000-08-08 08:04:29 +00006257#else
6258
6259static int
6260unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6261{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006262 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006263
6264 Py_UNICODE *s1 = str1->str;
6265 Py_UNICODE *s2 = str2->str;
6266
6267 len1 = str1->length;
6268 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006269
Marc-André Lemburge5034372000-08-08 08:04:29 +00006270 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006271 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006272
Fredrik Lundh45714e92001-06-26 16:39:36 +00006273 c1 = *s1++;
6274 c2 = *s2++;
6275
6276 if (c1 != c2)
6277 return (c1 < c2) ? -1 : 1;
6278
Marc-André Lemburge5034372000-08-08 08:04:29 +00006279 len1--; len2--;
6280 }
6281
6282 return (len1 < len2) ? -1 : (len1 != len2);
6283}
6284
6285#endif
6286
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287int PyUnicode_Compare(PyObject *left,
6288 PyObject *right)
6289{
6290 PyUnicodeObject *u = NULL, *v = NULL;
6291 int result;
6292
6293 /* Coerce the two arguments */
6294 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6295 if (u == NULL)
6296 goto onError;
6297 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6298 if (v == NULL)
6299 goto onError;
6300
Thomas Wouters7e474022000-07-16 12:04:32 +00006301 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 if (v == u) {
6303 Py_DECREF(u);
6304 Py_DECREF(v);
6305 return 0;
6306 }
6307
6308 result = unicode_compare(u, v);
6309
6310 Py_DECREF(u);
6311 Py_DECREF(v);
6312 return result;
6313
6314onError:
6315 Py_XDECREF(u);
6316 Py_XDECREF(v);
6317 return -1;
6318}
6319
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006320PyObject *PyUnicode_RichCompare(PyObject *left,
6321 PyObject *right,
6322 int op)
6323{
6324 int result;
6325
6326 result = PyUnicode_Compare(left, right);
6327 if (result == -1 && PyErr_Occurred())
6328 goto onError;
6329
6330 /* Convert the return value to a Boolean */
6331 switch (op) {
6332 case Py_EQ:
6333 result = (result == 0);
6334 break;
6335 case Py_NE:
6336 result = (result != 0);
6337 break;
6338 case Py_LE:
6339 result = (result <= 0);
6340 break;
6341 case Py_GE:
6342 result = (result >= 0);
6343 break;
6344 case Py_LT:
6345 result = (result == -1);
6346 break;
6347 case Py_GT:
6348 result = (result == 1);
6349 break;
6350 }
6351 return PyBool_FromLong(result);
6352
6353 onError:
6354
6355 /* Standard case
6356
6357 Type errors mean that PyUnicode_FromObject() could not convert
6358 one of the arguments (usually the right hand side) to Unicode,
6359 ie. we can't handle the comparison request. However, it is
6360 possible that the other object knows a comparison method, which
6361 is why we return Py_NotImplemented to give the other object a
6362 chance.
6363
6364 */
6365 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6366 PyErr_Clear();
6367 Py_INCREF(Py_NotImplemented);
6368 return Py_NotImplemented;
6369 }
6370 if (op != Py_EQ && op != Py_NE)
6371 return NULL;
6372
6373 /* Equality comparison.
6374
6375 This is a special case: we silence any PyExc_UnicodeDecodeError
6376 and instead turn it into a PyErr_UnicodeWarning.
6377
6378 */
6379 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6380 return NULL;
6381 PyErr_Clear();
6382 if (PyErr_Warn(PyExc_UnicodeWarning,
6383 (op == Py_EQ) ?
6384 "Unicode equal comparison "
6385 "failed to convert both arguments to Unicode - "
6386 "interpreting them as being unequal" :
6387 "Unicode unequal comparison "
6388 "failed to convert both arguments to Unicode - "
6389 "interpreting them as being unequal"
6390 ) < 0)
6391 return NULL;
6392 result = (op == Py_NE);
6393 return PyBool_FromLong(result);
6394}
6395
Guido van Rossum403d68b2000-03-13 15:55:09 +00006396int PyUnicode_Contains(PyObject *container,
6397 PyObject *element)
6398{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006399 PyObject *str, *sub;
6400 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006401
6402 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006403 sub = PyUnicode_FromObject(element);
6404 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006405 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006406 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006407 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006408 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006409
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006410 str = PyUnicode_FromObject(container);
6411 if (!str) {
6412 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006413 return -1;
6414 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006415
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006416 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006417
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006418 Py_DECREF(str);
6419 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006420
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006421 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006422}
6423
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424/* Concat to string or Unicode object giving a new Unicode object. */
6425
6426PyObject *PyUnicode_Concat(PyObject *left,
6427 PyObject *right)
6428{
6429 PyUnicodeObject *u = NULL, *v = NULL, *w;
6430
6431 /* Coerce the two arguments */
6432 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6433 if (u == NULL)
6434 goto onError;
6435 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6436 if (v == NULL)
6437 goto onError;
6438
6439 /* Shortcuts */
6440 if (v == unicode_empty) {
6441 Py_DECREF(v);
6442 return (PyObject *)u;
6443 }
6444 if (u == unicode_empty) {
6445 Py_DECREF(u);
6446 return (PyObject *)v;
6447 }
6448
6449 /* Concat the two Unicode strings */
6450 w = _PyUnicode_New(u->length + v->length);
6451 if (w == NULL)
6452 goto onError;
6453 Py_UNICODE_COPY(w->str, u->str, u->length);
6454 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6455
6456 Py_DECREF(u);
6457 Py_DECREF(v);
6458 return (PyObject *)w;
6459
6460onError:
6461 Py_XDECREF(u);
6462 Py_XDECREF(v);
6463 return NULL;
6464}
6465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006466PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467"S.count(sub[, start[, end]]) -> int\n\
6468\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006469Return the number of non-overlapping occurrences of substring sub in\n\
6470Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006471interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472
6473static PyObject *
6474unicode_count(PyUnicodeObject *self, PyObject *args)
6475{
6476 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006477 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006478 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 PyObject *result;
6480
Guido van Rossumb8872e62000-05-09 14:14:27 +00006481 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6482 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 return NULL;
6484
6485 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006486 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 if (substring == NULL)
6488 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006489
Fredrik Lundhc8162812006-05-26 19:33:03 +00006490 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006492 result = PyInt_FromSsize_t(
6493 stringlib_count(self->str + start, end - start,
6494 substring->str, substring->length)
6495 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496
6497 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006498
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 return result;
6500}
6501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006502PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006503"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006505Encodes S using the codec registered for encoding. encoding defaults\n\
6506to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006507handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006508a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6509'xmlcharrefreplace' as well as any other name registered with\n\
6510codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
6512static PyObject *
6513unicode_encode(PyUnicodeObject *self, PyObject *args)
6514{
6515 char *encoding = NULL;
6516 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006517 PyObject *v;
6518
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6520 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006521 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006522 if (v == NULL)
6523 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006524 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006525 PyErr_Format(PyExc_TypeError,
6526 "encoder did not return a string/unicode object "
6527 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006528 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006529 Py_DECREF(v);
6530 return NULL;
6531 }
6532 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006533
6534 onError:
6535 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006536}
6537
6538PyDoc_STRVAR(decode__doc__,
6539"S.decode([encoding[,errors]]) -> string or unicode\n\
6540\n\
6541Decodes S using the codec registered for encoding. encoding defaults\n\
6542to the default encoding. errors may be given to set a different error\n\
6543handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6544a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6545as well as any other name registerd with codecs.register_error that is\n\
6546able to handle UnicodeDecodeErrors.");
6547
6548static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006549unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006550{
6551 char *encoding = NULL;
6552 char *errors = NULL;
6553 PyObject *v;
6554
6555 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6556 return NULL;
6557 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006558 if (v == NULL)
6559 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006560 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006561 PyErr_Format(PyExc_TypeError,
6562 "decoder did not return a string/unicode object "
6563 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006564 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006565 Py_DECREF(v);
6566 return NULL;
6567 }
6568 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006569
6570 onError:
6571 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572}
6573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006574PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575"S.expandtabs([tabsize]) -> unicode\n\
6576\n\
6577Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006578If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579
6580static PyObject*
6581unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6582{
6583 Py_UNICODE *e;
6584 Py_UNICODE *p;
6585 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006586 Py_UNICODE *qe;
6587 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 PyUnicodeObject *u;
6589 int tabsize = 8;
6590
6591 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6592 return NULL;
6593
Thomas Wouters7e474022000-07-16 12:04:32 +00006594 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006595 i = 0; /* chars up to and including most recent \n or \r */
6596 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6597 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 for (p = self->str; p < e; p++)
6599 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006600 if (tabsize > 0) {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006601 incr = tabsize - (j % tabsize); /* cannot overflow */
6602 if (j > PY_SSIZE_T_MAX - incr)
6603 goto overflow1;
6604 j += incr;
6605 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 }
6607 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006608 if (j > PY_SSIZE_T_MAX - 1)
6609 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 j++;
6611 if (*p == '\n' || *p == '\r') {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006612 if (i > PY_SSIZE_T_MAX - j)
6613 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006615 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 }
6617 }
6618
Guido van Rossum5bdff602008-03-11 21:18:06 +00006619 if (i > PY_SSIZE_T_MAX - j)
6620 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006621
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 /* Second pass: create output string and fill it */
6623 u = _PyUnicode_New(i + j);
6624 if (!u)
6625 return NULL;
6626
Guido van Rossum5bdff602008-03-11 21:18:06 +00006627 j = 0; /* same as in first pass */
6628 q = u->str; /* next output char */
6629 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630
6631 for (p = self->str; p < e; p++)
6632 if (*p == '\t') {
6633 if (tabsize > 0) {
6634 i = tabsize - (j % tabsize);
6635 j += i;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006636 while (i--) {
6637 if (q >= qe)
6638 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 }
6642 }
6643 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006644 if (q >= qe)
6645 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006647 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 if (*p == '\n' || *p == '\r')
6649 j = 0;
6650 }
6651
6652 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006653
6654 overflow2:
6655 Py_DECREF(u);
6656 overflow1:
6657 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659}
6660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006661PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662"S.find(sub [,start [,end]]) -> int\n\
6663\n\
6664Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006665such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666arguments start and end are interpreted as in slice notation.\n\
6667\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669
6670static PyObject *
6671unicode_find(PyUnicodeObject *self, PyObject *args)
6672{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006673 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006674 Py_ssize_t start;
6675 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006676 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677
Facundo Batista57d56692007-11-16 18:04:14 +00006678 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006681 result = stringlib_find_slice(
6682 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6683 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6684 start, end
6685 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686
6687 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006688
6689 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690}
6691
6692static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006693unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694{
6695 if (index < 0 || index >= self->length) {
6696 PyErr_SetString(PyExc_IndexError, "string index out of range");
6697 return NULL;
6698 }
6699
6700 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6701}
6702
6703static long
6704unicode_hash(PyUnicodeObject *self)
6705{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006706 /* Since Unicode objects compare equal to their ASCII string
6707 counterparts, they should use the individual character values
6708 as basis for their hash value. This is needed to assure that
6709 strings and Unicode objects behave in the same way as
6710 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711
Martin v. Löwis18e16552006-02-15 17:27:45 +00006712 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006713 register Py_UNICODE *p;
6714 register long x;
6715
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 if (self->hash != -1)
6717 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006718 len = PyUnicode_GET_SIZE(self);
6719 p = PyUnicode_AS_UNICODE(self);
6720 x = *p << 7;
6721 while (--len >= 0)
6722 x = (1000003*x) ^ *p++;
6723 x ^= PyUnicode_GET_SIZE(self);
6724 if (x == -1)
6725 x = -2;
6726 self->hash = x;
6727 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728}
6729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006730PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731"S.index(sub [,start [,end]]) -> int\n\
6732\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006733Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734
6735static PyObject *
6736unicode_index(PyUnicodeObject *self, PyObject *args)
6737{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006738 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006739 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006740 Py_ssize_t start;
6741 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742
Facundo Batista57d56692007-11-16 18:04:14 +00006743 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006746 result = stringlib_find_slice(
6747 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6748 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6749 start, end
6750 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751
6752 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006753
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 if (result < 0) {
6755 PyErr_SetString(PyExc_ValueError, "substring not found");
6756 return NULL;
6757 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006758
Martin v. Löwis18e16552006-02-15 17:27:45 +00006759 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760}
6761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006762PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006763"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006765Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006766at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767
6768static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006769unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
6771 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6772 register const Py_UNICODE *e;
6773 int cased;
6774
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 /* Shortcut for single character strings */
6776 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006779 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006780 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006781 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006782
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 e = p + PyUnicode_GET_SIZE(self);
6784 cased = 0;
6785 for (; p < e; p++) {
6786 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006787
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006789 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 else if (!cased && Py_UNICODE_ISLOWER(ch))
6791 cased = 1;
6792 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006793 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794}
6795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006796PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006797"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006799Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006800at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801
6802static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006803unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804{
6805 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6806 register const Py_UNICODE *e;
6807 int cased;
6808
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 /* Shortcut for single character strings */
6810 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006811 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006813 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006814 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006815 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006816
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 e = p + PyUnicode_GET_SIZE(self);
6818 cased = 0;
6819 for (; p < e; p++) {
6820 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006821
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006823 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 else if (!cased && Py_UNICODE_ISUPPER(ch))
6825 cased = 1;
6826 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006827 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828}
6829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006830PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006831"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006833Return True if S is a titlecased string and there is at least one\n\
6834character in S, i.e. upper- and titlecase characters may only\n\
6835follow uncased characters and lowercase characters only cased ones.\n\
6836Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837
6838static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006839unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840{
6841 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6842 register const Py_UNICODE *e;
6843 int cased, previous_is_cased;
6844
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 /* Shortcut for single character strings */
6846 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6848 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006850 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006851 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006852 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 e = p + PyUnicode_GET_SIZE(self);
6855 cased = 0;
6856 previous_is_cased = 0;
6857 for (; p < e; p++) {
6858 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6861 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006862 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 previous_is_cased = 1;
6864 cased = 1;
6865 }
6866 else if (Py_UNICODE_ISLOWER(ch)) {
6867 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006868 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 previous_is_cased = 1;
6870 cased = 1;
6871 }
6872 else
6873 previous_is_cased = 0;
6874 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006875 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876}
6877
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006878PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006879"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006881Return True if all characters in S are whitespace\n\
6882and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883
6884static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006885unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
6887 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6888 register const Py_UNICODE *e;
6889
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 /* Shortcut for single character strings */
6891 if (PyUnicode_GET_SIZE(self) == 1 &&
6892 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006893 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006895 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006896 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006897 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006898
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 e = p + PyUnicode_GET_SIZE(self);
6900 for (; p < e; p++) {
6901 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006902 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006904 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905}
6906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006907PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006908"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006909\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006910Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006911and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006912
6913static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006914unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006915{
6916 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6917 register const Py_UNICODE *e;
6918
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006919 /* Shortcut for single character strings */
6920 if (PyUnicode_GET_SIZE(self) == 1 &&
6921 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006923
6924 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006925 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006926 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006927
6928 e = p + PyUnicode_GET_SIZE(self);
6929 for (; p < e; p++) {
6930 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006931 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006932 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006933 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006934}
6935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006936PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006937"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006938\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006939Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006940and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006941
6942static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006943unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006944{
6945 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6946 register const Py_UNICODE *e;
6947
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006948 /* Shortcut for single character strings */
6949 if (PyUnicode_GET_SIZE(self) == 1 &&
6950 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006951 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006952
6953 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006954 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006955 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006956
6957 e = p + PyUnicode_GET_SIZE(self);
6958 for (; p < e; p++) {
6959 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006960 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006961 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006962 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006963}
6964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006965PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006966"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006968Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006969False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970
6971static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006972unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973{
6974 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6975 register const Py_UNICODE *e;
6976
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 /* Shortcut for single character strings */
6978 if (PyUnicode_GET_SIZE(self) == 1 &&
6979 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006980 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006982 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006983 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006984 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006985
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 e = p + PyUnicode_GET_SIZE(self);
6987 for (; p < e; p++) {
6988 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006989 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006991 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992}
6993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006994PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006995"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006997Return True if all characters in S are digits\n\
6998and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999
7000static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007001unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002{
7003 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7004 register const Py_UNICODE *e;
7005
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006 /* Shortcut for single character strings */
7007 if (PyUnicode_GET_SIZE(self) == 1 &&
7008 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007009 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007011 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007012 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007013 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007014
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015 e = p + PyUnicode_GET_SIZE(self);
7016 for (; p < e; p++) {
7017 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007018 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007020 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021}
7022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007023PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007024"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007026Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007027False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028
7029static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007030unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031{
7032 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7033 register const Py_UNICODE *e;
7034
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 /* Shortcut for single character strings */
7036 if (PyUnicode_GET_SIZE(self) == 1 &&
7037 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007038 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007040 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007041 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007042 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007043
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 e = p + PyUnicode_GET_SIZE(self);
7045 for (; p < e; p++) {
7046 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007047 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007049 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050}
7051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007052PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053"S.join(sequence) -> unicode\n\
7054\n\
7055Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007056sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057
7058static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007059unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007061 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062}
7063
Martin v. Löwis18e16552006-02-15 17:27:45 +00007064static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065unicode_length(PyUnicodeObject *self)
7066{
7067 return self->length;
7068}
7069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007070PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007071"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072\n\
7073Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007074done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075
7076static PyObject *
7077unicode_ljust(PyUnicodeObject *self, PyObject *args)
7078{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007079 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007080 Py_UNICODE fillchar = ' ';
7081
Martin v. Löwis412fb672006-04-13 06:34:32 +00007082 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083 return NULL;
7084
Tim Peters7a29bd52001-09-12 03:03:31 +00007085 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086 Py_INCREF(self);
7087 return (PyObject*) self;
7088 }
7089
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007090 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091}
7092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007093PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094"S.lower() -> unicode\n\
7095\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007096Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097
7098static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007099unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101 return fixup(self, fixlower);
7102}
7103
/* Selector values for the strip family.  They double as indices into
   stripformat[] below, so the two must be kept in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per selector above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-char "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
7112
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007113/* externally visible for str.strip(unicode) */
7114PyObject *
7115_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7116{
7117 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007118 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007119 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007120 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7121 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007122
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007123 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7124
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125 i = 0;
7126 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007127 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7128 i++;
7129 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007130 }
7131
7132 j = len;
7133 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007134 do {
7135 j--;
7136 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7137 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007138 }
7139
7140 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007141 Py_INCREF(self);
7142 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007143 }
7144 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007145 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007146}
7147
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148
7149static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007150do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007152 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007153 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007154
7155 i = 0;
7156 if (striptype != RIGHTSTRIP) {
7157 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7158 i++;
7159 }
7160 }
7161
7162 j = len;
7163 if (striptype != LEFTSTRIP) {
7164 do {
7165 j--;
7166 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7167 j++;
7168 }
7169
7170 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7171 Py_INCREF(self);
7172 return (PyObject*)self;
7173 }
7174 else
7175 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176}
7177
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007178
7179static PyObject *
7180do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7181{
7182 PyObject *sep = NULL;
7183
7184 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7185 return NULL;
7186
7187 if (sep != NULL && sep != Py_None) {
7188 if (PyUnicode_Check(sep))
7189 return _PyUnicode_XStrip(self, striptype, sep);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007190 else if (PyString_Check(sep)) {
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007191 PyObject *res;
7192 sep = PyUnicode_FromObject(sep);
7193 if (sep==NULL)
7194 return NULL;
7195 res = _PyUnicode_XStrip(self, striptype, sep);
7196 Py_DECREF(sep);
7197 return res;
7198 }
7199 else {
7200 PyErr_Format(PyExc_TypeError,
7201 "%s arg must be None, unicode or str",
7202 STRIPNAME(striptype));
7203 return NULL;
7204 }
7205 }
7206
7207 return do_strip(self, striptype);
7208}
7209
7210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007211PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007212"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007213\n\
7214Return a copy of the string S with leading and trailing\n\
7215whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007216If chars is given and not None, remove characters in chars instead.\n\
7217If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007218
7219static PyObject *
7220unicode_strip(PyUnicodeObject *self, PyObject *args)
7221{
7222 if (PyTuple_GET_SIZE(args) == 0)
7223 return do_strip(self, BOTHSTRIP); /* Common case */
7224 else
7225 return do_argstrip(self, BOTHSTRIP, args);
7226}
7227
7228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007229PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007230"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007231\n\
7232Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007233If chars is given and not None, remove characters in chars instead.\n\
7234If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007235
7236static PyObject *
7237unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7238{
7239 if (PyTuple_GET_SIZE(args) == 0)
7240 return do_strip(self, LEFTSTRIP); /* Common case */
7241 else
7242 return do_argstrip(self, LEFTSTRIP, args);
7243}
7244
7245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007246PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007247"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007248\n\
7249Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007250If chars is given and not None, remove characters in chars instead.\n\
7251If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007252
7253static PyObject *
7254unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7255{
7256 if (PyTuple_GET_SIZE(args) == 0)
7257 return do_strip(self, RIGHTSTRIP); /* Common case */
7258 else
7259 return do_argstrip(self, RIGHTSTRIP, args);
7260}
7261
7262
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007264unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265{
7266 PyUnicodeObject *u;
7267 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007268 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007269 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270
7271 if (len < 0)
7272 len = 0;
7273
Tim Peters7a29bd52001-09-12 03:03:31 +00007274 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 /* no repeat, return original string */
7276 Py_INCREF(str);
7277 return (PyObject*) str;
7278 }
Tim Peters8f422462000-09-09 06:13:41 +00007279
7280 /* ensure # of chars needed doesn't overflow int and # of bytes
7281 * needed doesn't overflow size_t
7282 */
7283 nchars = len * str->length;
7284 if (len && nchars / len != str->length) {
7285 PyErr_SetString(PyExc_OverflowError,
7286 "repeated string is too long");
7287 return NULL;
7288 }
7289 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7290 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7291 PyErr_SetString(PyExc_OverflowError,
7292 "repeated string is too long");
7293 return NULL;
7294 }
7295 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296 if (!u)
7297 return NULL;
7298
7299 p = u->str;
7300
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007301 if (str->length == 1 && len > 0) {
7302 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007303 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007304 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007305 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007306 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007307 done = str->length;
7308 }
7309 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007310 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007311 Py_UNICODE_COPY(p+done, p, n);
7312 done += n;
7313 }
7314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
7316 return (PyObject*) u;
7317}
7318
7319PyObject *PyUnicode_Replace(PyObject *obj,
7320 PyObject *subobj,
7321 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007322 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323{
7324 PyObject *self;
7325 PyObject *str1;
7326 PyObject *str2;
7327 PyObject *result;
7328
7329 self = PyUnicode_FromObject(obj);
7330 if (self == NULL)
7331 return NULL;
7332 str1 = PyUnicode_FromObject(subobj);
7333 if (str1 == NULL) {
7334 Py_DECREF(self);
7335 return NULL;
7336 }
7337 str2 = PyUnicode_FromObject(replobj);
7338 if (str2 == NULL) {
7339 Py_DECREF(self);
7340 Py_DECREF(str1);
7341 return NULL;
7342 }
Tim Petersced69f82003-09-16 20:30:58 +00007343 result = replace((PyUnicodeObject *)self,
7344 (PyUnicodeObject *)str1,
7345 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 maxcount);
7347 Py_DECREF(self);
7348 Py_DECREF(str1);
7349 Py_DECREF(str2);
7350 return result;
7351}
7352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007353PyDoc_STRVAR(replace__doc__,
Georg Brandl30fadc12008-05-30 07:54:16 +00007354"S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355\n\
7356Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007357old replaced by new. If the optional argument count is\n\
7358given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359
7360static PyObject*
7361unicode_replace(PyUnicodeObject *self, PyObject *args)
7362{
7363 PyUnicodeObject *str1;
7364 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007365 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366 PyObject *result;
7367
Martin v. Löwis18e16552006-02-15 17:27:45 +00007368 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369 return NULL;
7370 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7371 if (str1 == NULL)
7372 return NULL;
7373 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007374 if (str2 == NULL) {
7375 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007377 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378
7379 result = replace(self, str1, str2, maxcount);
7380
7381 Py_DECREF(str1);
7382 Py_DECREF(str2);
7383 return result;
7384}
7385
7386static
7387PyObject *unicode_repr(PyObject *unicode)
7388{
7389 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7390 PyUnicode_GET_SIZE(unicode),
7391 1);
7392}
7393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007394PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395"S.rfind(sub [,start [,end]]) -> int\n\
7396\n\
7397Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007398such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399arguments start and end are interpreted as in slice notation.\n\
7400\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007401Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
7403static PyObject *
7404unicode_rfind(PyUnicodeObject *self, PyObject *args)
7405{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007406 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007407 Py_ssize_t start;
7408 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007409 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410
Facundo Batista57d56692007-11-16 18:04:14 +00007411 if (!_ParseTupleFinds(args, &substring, &start, &end))
7412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007414 result = stringlib_rfind_slice(
7415 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7416 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7417 start, end
7418 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419
7420 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007421
7422 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423}
7424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007425PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426"S.rindex(sub [,start [,end]]) -> int\n\
7427\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007428Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
7430static PyObject *
7431unicode_rindex(PyUnicodeObject *self, PyObject *args)
7432{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007433 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007434 Py_ssize_t start;
7435 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007436 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437
Facundo Batista57d56692007-11-16 18:04:14 +00007438 if (!_ParseTupleFinds(args, &substring, &start, &end))
7439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007441 result = stringlib_rfind_slice(
7442 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7443 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7444 start, end
7445 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007448
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 if (result < 0) {
7450 PyErr_SetString(PyExc_ValueError, "substring not found");
7451 return NULL;
7452 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007453 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454}
7455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007456PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007457"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458\n\
7459Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007460done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461
7462static PyObject *
7463unicode_rjust(PyUnicodeObject *self, PyObject *args)
7464{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007465 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007466 Py_UNICODE fillchar = ' ';
7467
Martin v. Löwis412fb672006-04-13 06:34:32 +00007468 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 return NULL;
7470
Tim Peters7a29bd52001-09-12 03:03:31 +00007471 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 Py_INCREF(self);
7473 return (PyObject*) self;
7474 }
7475
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007476 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477}
7478
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007480unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481{
7482 /* standard clamping */
7483 if (start < 0)
7484 start = 0;
7485 if (end < 0)
7486 end = 0;
7487 if (end > self->length)
7488 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007489 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 /* full slice, return original string */
7491 Py_INCREF(self);
7492 return (PyObject*) self;
7493 }
7494 if (start > end)
7495 start = end;
7496 /* copy slice */
7497 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7498 end - start);
7499}
7500
7501PyObject *PyUnicode_Split(PyObject *s,
7502 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007503 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504{
7505 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007506
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507 s = PyUnicode_FromObject(s);
7508 if (s == NULL)
7509 return NULL;
7510 if (sep != NULL) {
7511 sep = PyUnicode_FromObject(sep);
7512 if (sep == NULL) {
7513 Py_DECREF(s);
7514 return NULL;
7515 }
7516 }
7517
7518 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7519
7520 Py_DECREF(s);
7521 Py_XDECREF(sep);
7522 return result;
7523}
7524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007525PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526"S.split([sep [,maxsplit]]) -> list of strings\n\
7527\n\
7528Return a list of the words in S, using sep as the\n\
7529delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007530splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007531whitespace string is a separator and empty strings are\n\
7532removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533
7534static PyObject*
7535unicode_split(PyUnicodeObject *self, PyObject *args)
7536{
7537 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007538 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539
Martin v. Löwis18e16552006-02-15 17:27:45 +00007540 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541 return NULL;
7542
7543 if (substring == Py_None)
7544 return split(self, NULL, maxcount);
7545 else if (PyUnicode_Check(substring))
7546 return split(self, (PyUnicodeObject *)substring, maxcount);
7547 else
7548 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7549}
7550
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007551PyObject *
7552PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7553{
7554 PyObject* str_obj;
7555 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007556 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007557
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007558 str_obj = PyUnicode_FromObject(str_in);
7559 if (!str_obj)
7560 return NULL;
7561 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007562 if (!sep_obj) {
7563 Py_DECREF(str_obj);
7564 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007565 }
7566
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007567 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007568 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7569 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7570 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007571
Fredrik Lundhb9479482006-05-26 17:22:38 +00007572 Py_DECREF(sep_obj);
7573 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007574
7575 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007576}
7577
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007578
7579PyObject *
7580PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7581{
7582 PyObject* str_obj;
7583 PyObject* sep_obj;
7584 PyObject* out;
7585
7586 str_obj = PyUnicode_FromObject(str_in);
7587 if (!str_obj)
7588 return NULL;
7589 sep_obj = PyUnicode_FromObject(sep_in);
7590 if (!sep_obj) {
7591 Py_DECREF(str_obj);
7592 return NULL;
7593 }
7594
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007595 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007596 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7597 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7598 );
7599
7600 Py_DECREF(sep_obj);
7601 Py_DECREF(str_obj);
7602
7603 return out;
7604}
7605
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007606PyDoc_STRVAR(partition__doc__,
7607"S.partition(sep) -> (head, sep, tail)\n\
7608\n\
7609Searches for the separator sep in S, and returns the part before it,\n\
7610the separator itself, and the part after it. If the separator is not\n\
7611found, returns S and two empty strings.");
7612
7613static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007614unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007615{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007616 return PyUnicode_Partition((PyObject *)self, separator);
7617}
7618
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007619PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007620"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007621\n\
7622Searches for the separator sep in S, starting at the end of S, and returns\n\
7623the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007624separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007625
7626static PyObject*
7627unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7628{
7629 return PyUnicode_RPartition((PyObject *)self, separator);
7630}
7631
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007632PyObject *PyUnicode_RSplit(PyObject *s,
7633 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007634 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007635{
7636 PyObject *result;
7637
7638 s = PyUnicode_FromObject(s);
7639 if (s == NULL)
7640 return NULL;
7641 if (sep != NULL) {
7642 sep = PyUnicode_FromObject(sep);
7643 if (sep == NULL) {
7644 Py_DECREF(s);
7645 return NULL;
7646 }
7647 }
7648
7649 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7650
7651 Py_DECREF(s);
7652 Py_XDECREF(sep);
7653 return result;
7654}
7655
7656PyDoc_STRVAR(rsplit__doc__,
7657"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7658\n\
7659Return a list of the words in S, using sep as the\n\
7660delimiter string, starting at the end of the string and\n\
7661working to the front. If maxsplit is given, at most maxsplit\n\
7662splits are done. If sep is not specified, any whitespace string\n\
7663is a separator.");
7664
7665static PyObject*
7666unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7667{
7668 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007669 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007670
Martin v. Löwis18e16552006-02-15 17:27:45 +00007671 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007672 return NULL;
7673
7674 if (substring == Py_None)
7675 return rsplit(self, NULL, maxcount);
7676 else if (PyUnicode_Check(substring))
7677 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7678 else
7679 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7680}
7681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007682PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007683"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684\n\
7685Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007686Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007687is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688
7689static PyObject*
7690unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7691{
Guido van Rossum86662912000-04-11 15:38:46 +00007692 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693
Guido van Rossum86662912000-04-11 15:38:46 +00007694 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695 return NULL;
7696
Guido van Rossum86662912000-04-11 15:38:46 +00007697 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698}
7699
7700static
7701PyObject *unicode_str(PyUnicodeObject *self)
7702{
Fred Drakee4315f52000-05-09 19:53:39 +00007703 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704}
7705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007706PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707"S.swapcase() -> unicode\n\
7708\n\
7709Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007710and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711
7712static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007713unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 return fixup(self, fixswapcase);
7716}
7717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007718PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719"S.translate(table) -> unicode\n\
7720\n\
7721Return a copy of the string S, where all characters have been mapped\n\
7722through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007723Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7724Unmapped characters are left untouched. Characters mapped to None\n\
7725are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
7727static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007728unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729{
Tim Petersced69f82003-09-16 20:30:58 +00007730 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007732 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 "ignore");
7734}
7735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007736PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737"S.upper() -> unicode\n\
7738\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007739Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740
7741static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007742unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 return fixup(self, fixupper);
7745}
7746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007747PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748"S.zfill(width) -> unicode\n\
7749\n\
Georg Brandl98064072008-09-09 19:26:00 +00007750Pad a numeric string S with zeros on the left, to fill a field\n\
7751of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752
7753static PyObject *
7754unicode_zfill(PyUnicodeObject *self, PyObject *args)
7755{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007756 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 PyUnicodeObject *u;
7758
Martin v. Löwis18e16552006-02-15 17:27:45 +00007759 Py_ssize_t width;
7760 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 return NULL;
7762
7763 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007764 if (PyUnicode_CheckExact(self)) {
7765 Py_INCREF(self);
7766 return (PyObject*) self;
7767 }
7768 else
7769 return PyUnicode_FromUnicode(
7770 PyUnicode_AS_UNICODE(self),
7771 PyUnicode_GET_SIZE(self)
7772 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 }
7774
7775 fill = width - self->length;
7776
7777 u = pad(self, fill, 0, '0');
7778
Walter Dörwald068325e2002-04-15 13:36:47 +00007779 if (u == NULL)
7780 return NULL;
7781
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 if (u->str[fill] == '+' || u->str[fill] == '-') {
7783 /* move sign to beginning of string */
7784 u->str[0] = u->str[fill];
7785 u->str[fill] = '0';
7786 }
7787
7788 return (PyObject*) u;
7789}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790
#if 0
/* Compiled out.  Would return the current value of numfree as a Python
   int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007799PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007800"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007802Return True if S starts with the specified prefix, False otherwise.\n\
7803With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007804With optional end, stop comparing S at that position.\n\
7805prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806
7807static PyObject *
7808unicode_startswith(PyUnicodeObject *self,
7809 PyObject *args)
7810{
Georg Brandl24250812006-06-09 18:45:48 +00007811 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007813 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007814 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007815 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816
Georg Brandl24250812006-06-09 18:45:48 +00007817 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007818 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007820 if (PyTuple_Check(subobj)) {
7821 Py_ssize_t i;
7822 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7823 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7824 PyTuple_GET_ITEM(subobj, i));
7825 if (substring == NULL)
7826 return NULL;
7827 result = tailmatch(self, substring, start, end, -1);
7828 Py_DECREF(substring);
7829 if (result) {
7830 Py_RETURN_TRUE;
7831 }
7832 }
7833 /* nothing matched */
7834 Py_RETURN_FALSE;
7835 }
7836 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007838 return NULL;
7839 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007841 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842}
7843
7844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007845PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007846"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007848Return True if S ends with the specified suffix, False otherwise.\n\
7849With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007850With optional end, stop comparing S at that position.\n\
7851suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852
7853static PyObject *
7854unicode_endswith(PyUnicodeObject *self,
7855 PyObject *args)
7856{
Georg Brandl24250812006-06-09 18:45:48 +00007857 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007859 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007860 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007861 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862
Georg Brandl24250812006-06-09 18:45:48 +00007863 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7864 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007866 if (PyTuple_Check(subobj)) {
7867 Py_ssize_t i;
7868 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7869 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7870 PyTuple_GET_ITEM(subobj, i));
7871 if (substring == NULL)
7872 return NULL;
7873 result = tailmatch(self, substring, start, end, +1);
7874 Py_DECREF(substring);
7875 if (result) {
7876 Py_RETURN_TRUE;
7877 }
7878 }
7879 Py_RETURN_FALSE;
7880 }
7881 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007883 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884
Georg Brandl24250812006-06-09 18:45:48 +00007885 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007887 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888}
7889
7890
Eric Smitha9f7d622008-02-17 19:46:49 +00007891/* Implements do_string_format, which is unicode because of stringlib */
7892#include "stringlib/string_format.h"
7893
7894PyDoc_STRVAR(format__doc__,
7895"S.format(*args, **kwargs) -> unicode\n\
7896\n\
7897");
7898
Eric Smithdc13b792008-05-30 18:10:04 +00007899static PyObject *
7900unicode__format__(PyObject *self, PyObject *args)
7901{
7902 PyObject *format_spec;
7903 PyObject *result = NULL;
7904 PyObject *tmp = NULL;
7905
7906 /* If 2.x, convert format_spec to the same type as value */
7907 /* This is to allow things like u''.format('') */
7908 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7909 goto done;
7910 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7911 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7912 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7913 goto done;
7914 }
7915 tmp = PyObject_Unicode(format_spec);
7916 if (tmp == NULL)
7917 goto done;
7918 format_spec = tmp;
7919
7920 result = _PyUnicode_FormatAdvanced(self,
7921 PyUnicode_AS_UNICODE(format_spec),
7922 PyUnicode_GET_SIZE(format_spec));
7923done:
7924 Py_XDECREF(tmp);
7925 return result;
7926}
7927
Eric Smitha9f7d622008-02-17 19:46:49 +00007928PyDoc_STRVAR(p_format__doc__,
7929"S.__format__(format_spec) -> unicode\n\
7930\n\
7931");
7932
Robert Schuppenies901c9972008-06-10 10:10:31 +00007933static PyObject *
7934unicode__sizeof__(PyUnicodeObject *v)
7935{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007936 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7937 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007938}
7939
7940PyDoc_STRVAR(sizeof__doc__,
7941"S.__sizeof__() -> size of S in memory, in bytes\n\
7942\n\
7943");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007944
7945static PyObject *
7946unicode_getnewargs(PyUnicodeObject *v)
7947{
7948 return Py_BuildValue("(u#)", v->str, v->length);
7949}
7950
7951
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952static PyMethodDef unicode_methods[] = {
7953
7954 /* Order is according to common usage: often used methods should
7955 appear first, since lookup is done sequentially. */
7956
Georg Brandlecdc0a92006-03-30 12:19:07 +00007957 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007958 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7959 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007960 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007961 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7962 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7963 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7964 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7965 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7966 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7967 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007968 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007969 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7970 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7971 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007972 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007973 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007974/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7975 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7976 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7977 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007978 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007979 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007980 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007981 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007982 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7983 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7984 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7985 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7986 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7987 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7988 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7989 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7990 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7991 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7992 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7993 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7994 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7995 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007996 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007997 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7998 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7999 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8000 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00008001 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008002#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008003 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004#endif
8005
8006#if 0
8007 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008008 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009#endif
8010
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008011 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 {NULL, NULL}
8013};
8014
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008015static PyObject *
8016unicode_mod(PyObject *v, PyObject *w)
8017{
8018 if (!PyUnicode_Check(v)) {
8019 Py_INCREF(Py_NotImplemented);
8020 return Py_NotImplemented;
8021 }
8022 return PyUnicode_Format(v, w);
8023}
8024
8025static PyNumberMethods unicode_as_number = {
8026 0, /*nb_add*/
8027 0, /*nb_subtract*/
8028 0, /*nb_multiply*/
8029 0, /*nb_divide*/
8030 unicode_mod, /*nb_remainder*/
8031};
8032
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008034 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00008035 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008036 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8037 (ssizeargfunc) unicode_getitem, /* sq_item */
8038 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 0, /* sq_ass_item */
8040 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00008041 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042};
8043
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008044static PyObject*
8045unicode_subscript(PyUnicodeObject* self, PyObject* item)
8046{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008047 if (PyIndex_Check(item)) {
8048 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008049 if (i == -1 && PyErr_Occurred())
8050 return NULL;
8051 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008052 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008053 return unicode_getitem(self, i);
8054 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008055 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008056 Py_UNICODE* source_buf;
8057 Py_UNICODE* result_buf;
8058 PyObject* result;
8059
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008060 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008061 &start, &stop, &step, &slicelength) < 0) {
8062 return NULL;
8063 }
8064
8065 if (slicelength <= 0) {
8066 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008067 } else if (start == 0 && step == 1 && slicelength == self->length &&
8068 PyUnicode_CheckExact(self)) {
8069 Py_INCREF(self);
8070 return (PyObject *)self;
8071 } else if (step == 1) {
8072 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008073 } else {
8074 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008075 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8076 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008077
8078 if (result_buf == NULL)
8079 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008080
8081 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8082 result_buf[i] = source_buf[cur];
8083 }
Tim Petersced69f82003-09-16 20:30:58 +00008084
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008085 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008086 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008087 return result;
8088 }
8089 } else {
8090 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8091 return NULL;
8092 }
8093}
8094
8095static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008096 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008097 (binaryfunc)unicode_subscript, /* mp_subscript */
8098 (objobjargproc)0, /* mp_ass_subscript */
8099};
8100
Martin v. Löwis18e16552006-02-15 17:27:45 +00008101static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 const void **ptr)
8105{
8106 if (index != 0) {
8107 PyErr_SetString(PyExc_SystemError,
8108 "accessing non-existent unicode segment");
8109 return -1;
8110 }
8111 *ptr = (void *) self->str;
8112 return PyUnicode_GET_DATA_SIZE(self);
8113}
8114
Martin v. Löwis18e16552006-02-15 17:27:45 +00008115static Py_ssize_t
8116unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117 const void **ptr)
8118{
8119 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00008120 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 return -1;
8122}
8123
8124static int
8125unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008126 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127{
8128 if (lenp)
8129 *lenp = PyUnicode_GET_DATA_SIZE(self);
8130 return 1;
8131}
8132
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008133static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008135 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136 const void **ptr)
8137{
8138 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008139
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 if (index != 0) {
8141 PyErr_SetString(PyExc_SystemError,
8142 "accessing non-existent unicode segment");
8143 return -1;
8144 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008145 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 if (str == NULL)
8147 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008148 *ptr = (void *) PyString_AS_STRING(str);
8149 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150}
8151
8152/* Helpers for PyUnicode_Format() */
8153
8154static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008155getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008157 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 if (argidx < arglen) {
8159 (*p_argidx)++;
8160 if (arglen < 0)
8161 return args;
8162 else
8163 return PyTuple_GetItem(args, argidx);
8164 }
8165 PyErr_SetString(PyExc_TypeError,
8166 "not enough arguments for format string");
8167 return NULL;
8168}
8169
/* Bit flags collected while parsing a format specifier.  F_ALT selects
   the '#' alternate form in formatfloat()/formatint(); the names of the
   others suggest the '-', '+', ' ' and '0' printf-style flags. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
8175
Martin v. Löwis18e16552006-02-15 17:27:45 +00008176static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008177strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008179 register Py_ssize_t i;
8180 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 for (i = len - 1; i >= 0; i--)
8182 buffer[i] = (Py_UNICODE) charbuffer[i];
8183
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184 return len;
8185}
8186
Neal Norwitzfc76d632006-01-10 06:03:13 +00008187static int
8188doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8189{
Tim Peters15231542006-02-16 01:08:01 +00008190 Py_ssize_t result;
8191
Neal Norwitzfc76d632006-01-10 06:03:13 +00008192 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008193 result = strtounicode(buffer, (char *)buffer);
8194 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008195}
8196
8197static int
8198longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8199{
Tim Peters15231542006-02-16 01:08:01 +00008200 Py_ssize_t result;
8201
Neal Norwitzfc76d632006-01-10 06:03:13 +00008202 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008203 result = strtounicode(buffer, (char *)buffer);
8204 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008205}
8206
Guido van Rossum078151d2002-08-11 04:24:12 +00008207/* XXX To save some code duplication, formatfloat/long/int could have been
8208 shared with stringobject.c, converting from 8-bit to Unicode after the
8209 formatting is done. */
8210
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211static int
8212formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008213 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214 int flags,
8215 int prec,
8216 int type,
8217 PyObject *v)
8218{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008219 /* fmt = '%#.' + `prec` + `type`
8220 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 char fmt[20];
8222 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008223
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 x = PyFloat_AsDouble(v);
8225 if (x == -1.0 && PyErr_Occurred())
8226 return -1;
8227 if (prec < 0)
8228 prec = 6;
Eric Smithd6c393a2008-07-17 19:49:47 +00008229 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8230 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008231 /* Worst case length calc to ensure no buffer overrun:
8232
8233 'g' formats:
8234 fmt = %#.<prec>g
8235 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8236 for any double rep.)
8237 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8238
8239 'f' formats:
8240 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8241 len = 1 + 50 + 1 + prec = 52 + prec
8242
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008243 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008244 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008245
8246 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008247 if (((type == 'g' || type == 'G') &&
8248 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smithd6c393a2008-07-17 19:49:47 +00008249 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008250 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008251 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008252 return -1;
8253 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008254 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8255 (flags&F_ALT) ? "#" : "",
8256 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008257 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258}
8259
Tim Peters38fd5b62000-09-21 05:43:11 +00008260static PyObject*
8261formatlong(PyObject *val, int flags, int prec, int type)
8262{
8263 char *buf;
8264 int i, len;
8265 PyObject *str; /* temporary string object. */
8266 PyUnicodeObject *result;
8267
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008268 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008269 if (!str)
8270 return NULL;
8271 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008272 if (!result) {
8273 Py_DECREF(str);
8274 return NULL;
8275 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008276 for (i = 0; i < len; i++)
8277 result->str[i] = buf[i];
8278 result->str[len] = 0;
8279 Py_DECREF(str);
8280 return (PyObject*)result;
8281}
8282
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283static int
8284formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008285 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 int flags,
8287 int prec,
8288 int type,
8289 PyObject *v)
8290{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008291 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008292 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8293 * + 1 + 1
8294 * = 24
8295 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008296 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008297 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 long x;
8299
8300 x = PyInt_AsLong(v);
8301 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008302 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008303 if (x < 0 && type == 'u') {
8304 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008305 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008306 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8307 sign = "-";
8308 else
8309 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008311 prec = 1;
8312
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008313 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8314 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008315 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008316 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008317 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008318 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008319 return -1;
8320 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008321
8322 if ((flags & F_ALT) &&
8323 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008324 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008325 * of issues that cause pain:
8326 * - when 0 is being converted, the C standard leaves off
8327 * the '0x' or '0X', which is inconsistent with other
8328 * %#x/%#X conversions and inconsistent with Python's
8329 * hex() function
8330 * - there are platforms that violate the standard and
8331 * convert 0 with the '0x' or '0X'
8332 * (Metrowerks, Compaq Tru64)
8333 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008334 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008335 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008336 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008337 * We can achieve the desired consistency by inserting our
8338 * own '0x' or '0X' prefix, and substituting %x/%X in place
8339 * of %#x/%#X.
8340 *
8341 * Note that this is the same approach as used in
8342 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008343 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008344 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8345 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008346 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008347 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008348 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8349 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008350 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008351 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008352 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008353 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008354 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008355 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356}
8357
8358static int
8359formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008360 size_t buflen,
8361 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008363 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008364 if (PyUnicode_Check(v)) {
8365 if (PyUnicode_GET_SIZE(v) != 1)
8366 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008370 else if (PyString_Check(v)) {
8371 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008372 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008373 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375
8376 else {
8377 /* Integer input truncated to a character */
8378 long x;
8379 x = PyInt_AsLong(v);
8380 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008381 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008382#ifdef Py_UNICODE_WIDE
8383 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008384 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008385 "%c arg not in range(0x110000) "
8386 "(wide Python build)");
8387 return -1;
8388 }
8389#else
8390 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008391 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008392 "%c arg not in range(0x10000) "
8393 "(narrow Python build)");
8394 return -1;
8395 }
8396#endif
8397 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 }
8399 buf[1] = '\0';
8400 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008401
8402 onError:
8403 PyErr_SetString(PyExc_TypeError,
8404 "%c requires int or char");
8405 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406}
8407
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8417
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418PyObject *PyUnicode_Format(PyObject *format,
8419 PyObject *args)
8420{
8421 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008422 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 int args_owned = 0;
8424 PyUnicodeObject *result = NULL;
8425 PyObject *dict = NULL;
8426 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008427
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 if (format == NULL || args == NULL) {
8429 PyErr_BadInternalCall();
8430 return NULL;
8431 }
8432 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008433 if (uformat == NULL)
8434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 fmt = PyUnicode_AS_UNICODE(uformat);
8436 fmtcnt = PyUnicode_GET_SIZE(uformat);
8437
8438 reslen = rescnt = fmtcnt + 100;
8439 result = _PyUnicode_New(reslen);
8440 if (result == NULL)
8441 goto onError;
8442 res = PyUnicode_AS_UNICODE(result);
8443
8444 if (PyTuple_Check(args)) {
8445 arglen = PyTuple_Size(args);
8446 argidx = 0;
8447 }
8448 else {
8449 arglen = -1;
8450 argidx = -2;
8451 }
Christian Heimese93237d2007-12-19 02:37:44 +00008452 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008453 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454 dict = args;
8455
8456 while (--fmtcnt >= 0) {
8457 if (*fmt != '%') {
8458 if (--rescnt < 0) {
8459 rescnt = fmtcnt + 100;
8460 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008461 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008462 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8464 --rescnt;
8465 }
8466 *res++ = *fmt++;
8467 }
8468 else {
8469 /* Got a format specifier */
8470 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008471 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473 Py_UNICODE c = '\0';
8474 Py_UNICODE fill;
Facundo Batistac11cecf2008-02-24 03:17:21 +00008475 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 PyObject *v = NULL;
8477 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008478 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008480 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008481 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482
8483 fmt++;
8484 if (*fmt == '(') {
8485 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008486 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 PyObject *key;
8488 int pcount = 1;
8489
8490 if (dict == NULL) {
8491 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008492 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493 goto onError;
8494 }
8495 ++fmt;
8496 --fmtcnt;
8497 keystart = fmt;
8498 /* Skip over balanced parentheses */
8499 while (pcount > 0 && --fmtcnt >= 0) {
8500 if (*fmt == ')')
8501 --pcount;
8502 else if (*fmt == '(')
8503 ++pcount;
8504 fmt++;
8505 }
8506 keylen = fmt - keystart - 1;
8507 if (fmtcnt < 0 || pcount > 0) {
8508 PyErr_SetString(PyExc_ValueError,
8509 "incomplete format key");
8510 goto onError;
8511 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008512#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008513 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514 then looked up since Python uses strings to hold
8515 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008516 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 key = PyUnicode_EncodeUTF8(keystart,
8518 keylen,
8519 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008520#else
8521 key = PyUnicode_FromUnicode(keystart, keylen);
8522#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523 if (key == NULL)
8524 goto onError;
8525 if (args_owned) {
8526 Py_DECREF(args);
8527 args_owned = 0;
8528 }
8529 args = PyObject_GetItem(dict, key);
8530 Py_DECREF(key);
8531 if (args == NULL) {
8532 goto onError;
8533 }
8534 args_owned = 1;
8535 arglen = -1;
8536 argidx = -2;
8537 }
8538 while (--fmtcnt >= 0) {
8539 switch (c = *fmt++) {
8540 case '-': flags |= F_LJUST; continue;
8541 case '+': flags |= F_SIGN; continue;
8542 case ' ': flags |= F_BLANK; continue;
8543 case '#': flags |= F_ALT; continue;
8544 case '0': flags |= F_ZERO; continue;
8545 }
8546 break;
8547 }
8548 if (c == '*') {
8549 v = getnextarg(args, arglen, &argidx);
8550 if (v == NULL)
8551 goto onError;
8552 if (!PyInt_Check(v)) {
8553 PyErr_SetString(PyExc_TypeError,
8554 "* wants int");
8555 goto onError;
8556 }
8557 width = PyInt_AsLong(v);
8558 if (width < 0) {
8559 flags |= F_LJUST;
8560 width = -width;
8561 }
8562 if (--fmtcnt >= 0)
8563 c = *fmt++;
8564 }
8565 else if (c >= '0' && c <= '9') {
8566 width = c - '0';
8567 while (--fmtcnt >= 0) {
8568 c = *fmt++;
8569 if (c < '0' || c > '9')
8570 break;
8571 if ((width*10) / 10 != width) {
8572 PyErr_SetString(PyExc_ValueError,
8573 "width too big");
8574 goto onError;
8575 }
8576 width = width*10 + (c - '0');
8577 }
8578 }
8579 if (c == '.') {
8580 prec = 0;
8581 if (--fmtcnt >= 0)
8582 c = *fmt++;
8583 if (c == '*') {
8584 v = getnextarg(args, arglen, &argidx);
8585 if (v == NULL)
8586 goto onError;
8587 if (!PyInt_Check(v)) {
8588 PyErr_SetString(PyExc_TypeError,
8589 "* wants int");
8590 goto onError;
8591 }
8592 prec = PyInt_AsLong(v);
8593 if (prec < 0)
8594 prec = 0;
8595 if (--fmtcnt >= 0)
8596 c = *fmt++;
8597 }
8598 else if (c >= '0' && c <= '9') {
8599 prec = c - '0';
8600 while (--fmtcnt >= 0) {
8601 c = Py_CHARMASK(*fmt++);
8602 if (c < '0' || c > '9')
8603 break;
8604 if ((prec*10) / 10 != prec) {
8605 PyErr_SetString(PyExc_ValueError,
8606 "prec too big");
8607 goto onError;
8608 }
8609 prec = prec*10 + (c - '0');
8610 }
8611 }
8612 } /* prec */
8613 if (fmtcnt >= 0) {
8614 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615 if (--fmtcnt >= 0)
8616 c = *fmt++;
8617 }
8618 }
8619 if (fmtcnt < 0) {
8620 PyErr_SetString(PyExc_ValueError,
8621 "incomplete format");
8622 goto onError;
8623 }
8624 if (c != '%') {
8625 v = getnextarg(args, arglen, &argidx);
8626 if (v == NULL)
8627 goto onError;
8628 }
8629 sign = 0;
8630 fill = ' ';
8631 switch (c) {
8632
8633 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008634 pbuf = formatbuf;
8635 /* presume that buffer length is at least 1 */
8636 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 len = 1;
8638 break;
8639
8640 case 's':
8641 case 'r':
8642 if (PyUnicode_Check(v) && c == 's') {
8643 temp = v;
8644 Py_INCREF(temp);
8645 }
8646 else {
8647 PyObject *unicode;
8648 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008649 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 else
8651 temp = PyObject_Repr(v);
8652 if (temp == NULL)
8653 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008654 if (PyUnicode_Check(temp))
8655 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008656 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008657 /* convert to string to Unicode */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008658 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8659 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008660 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008662 Py_DECREF(temp);
8663 temp = unicode;
8664 if (temp == NULL)
8665 goto onError;
8666 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008667 else {
8668 Py_DECREF(temp);
8669 PyErr_SetString(PyExc_TypeError,
8670 "%s argument has non-string str()");
8671 goto onError;
8672 }
8673 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008674 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 len = PyUnicode_GET_SIZE(temp);
8676 if (prec >= 0 && len > prec)
8677 len = prec;
8678 break;
8679
8680 case 'i':
8681 case 'd':
8682 case 'u':
8683 case 'o':
8684 case 'x':
8685 case 'X':
8686 if (c == 'i')
8687 c = 'd';
Facundo Batistac11cecf2008-02-24 03:17:21 +00008688 isnumok = 0;
8689 if (PyNumber_Check(v)) {
8690 PyObject *iobj=NULL;
8691
8692 if (PyInt_Check(v) || (PyLong_Check(v))) {
8693 iobj = v;
8694 Py_INCREF(iobj);
8695 }
8696 else {
8697 iobj = PyNumber_Int(v);
8698 if (iobj==NULL) iobj = PyNumber_Long(v);
8699 }
8700 if (iobj!=NULL) {
8701 if (PyInt_Check(iobj)) {
8702 isnumok = 1;
8703 pbuf = formatbuf;
8704 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8705 flags, prec, c, iobj);
8706 Py_DECREF(iobj);
8707 if (len < 0)
8708 goto onError;
8709 sign = 1;
8710 }
8711 else if (PyLong_Check(iobj)) {
8712 isnumok = 1;
8713 temp = formatlong(iobj, flags, prec, c);
8714 Py_DECREF(iobj);
8715 if (!temp)
8716 goto onError;
8717 pbuf = PyUnicode_AS_UNICODE(temp);
8718 len = PyUnicode_GET_SIZE(temp);
8719 sign = 1;
8720 }
8721 else {
8722 Py_DECREF(iobj);
8723 }
8724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 }
Facundo Batistac11cecf2008-02-24 03:17:21 +00008726 if (!isnumok) {
8727 PyErr_Format(PyExc_TypeError,
8728 "%%%c format: a number is required, "
Martin v. Löwisd918e4e2008-04-07 03:08:28 +00008729 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008730 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008731 }
8732 if (flags & F_ZERO)
8733 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 break;
8735
8736 case 'e':
8737 case 'E':
8738 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008739 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740 case 'g':
8741 case 'G':
Eric Smithd6c393a2008-07-17 19:49:47 +00008742 if (c == 'F')
8743 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008744 pbuf = formatbuf;
8745 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8746 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 if (len < 0)
8748 goto onError;
8749 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008750 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751 fill = '0';
8752 break;
8753
8754 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008755 pbuf = formatbuf;
8756 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 if (len < 0)
8758 goto onError;
8759 break;
8760
8761 default:
8762 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008763 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008764 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008765 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008766 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008767 (Py_ssize_t)(fmt - 1 -
8768 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 goto onError;
8770 }
8771 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008772 if (*pbuf == '-' || *pbuf == '+') {
8773 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 len--;
8775 }
8776 else if (flags & F_SIGN)
8777 sign = '+';
8778 else if (flags & F_BLANK)
8779 sign = ' ';
8780 else
8781 sign = 0;
8782 }
8783 if (width < len)
8784 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008785 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 reslen -= rescnt;
8787 rescnt = width + fmtcnt + 100;
8788 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008789 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008790 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008791 PyErr_NoMemory();
8792 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008793 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008794 if (_PyUnicode_Resize(&result, reslen) < 0) {
8795 Py_XDECREF(temp);
8796 goto onError;
8797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 res = PyUnicode_AS_UNICODE(result)
8799 + reslen - rescnt;
8800 }
8801 if (sign) {
8802 if (fill != ' ')
8803 *res++ = sign;
8804 rescnt--;
8805 if (width > len)
8806 width--;
8807 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008808 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8809 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008810 assert(pbuf[1] == c);
8811 if (fill != ' ') {
8812 *res++ = *pbuf++;
8813 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008814 }
Tim Petersfff53252001-04-12 18:38:48 +00008815 rescnt -= 2;
8816 width -= 2;
8817 if (width < 0)
8818 width = 0;
8819 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 if (width > len && !(flags & F_LJUST)) {
8822 do {
8823 --rescnt;
8824 *res++ = fill;
8825 } while (--width > len);
8826 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008827 if (fill == ' ') {
8828 if (sign)
8829 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008830 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008831 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008832 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008833 *res++ = *pbuf++;
8834 *res++ = *pbuf++;
8835 }
8836 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008837 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838 res += len;
8839 rescnt -= len;
8840 while (--width >= len) {
8841 --rescnt;
8842 *res++ = ' ';
8843 }
8844 if (dict && (argidx < arglen) && c != '%') {
8845 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008846 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008847 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 goto onError;
8849 }
8850 Py_XDECREF(temp);
8851 } /* '%' */
8852 } /* until end */
8853 if (argidx < arglen && !dict) {
8854 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008855 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856 goto onError;
8857 }
8858
Thomas Woutersa96affe2006-03-12 00:29:36 +00008859 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8860 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 if (args_owned) {
8862 Py_DECREF(args);
8863 }
8864 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 return (PyObject *)result;
8866
8867 onError:
8868 Py_XDECREF(result);
8869 Py_DECREF(uformat);
8870 if (args_owned) {
8871 Py_DECREF(args);
8872 }
8873 return NULL;
8874}
8875
8876static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008877 (readbufferproc) unicode_buffer_getreadbuf,
8878 (writebufferproc) unicode_buffer_getwritebuf,
8879 (segcountproc) unicode_buffer_getsegcount,
8880 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881};
8882
Jeremy Hylton938ace62002-07-17 16:30:39 +00008883static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008884unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8885
Tim Peters6d6c1a32001-08-02 04:15:00 +00008886static PyObject *
8887unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8888{
8889 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008890 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008891 char *encoding = NULL;
8892 char *errors = NULL;
8893
Guido van Rossume023fe02001-08-30 03:12:59 +00008894 if (type != &PyUnicode_Type)
8895 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008896 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8897 kwlist, &x, &encoding, &errors))
8898 return NULL;
8899 if (x == NULL)
8900 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008901 if (encoding == NULL && errors == NULL)
8902 return PyObject_Unicode(x);
8903 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008904 return PyUnicode_FromEncodedObject(x, encoding, errors);
8905}
8906
Guido van Rossume023fe02001-08-30 03:12:59 +00008907static PyObject *
8908unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8909{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008910 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008911 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008912
8913 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8914 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8915 if (tmp == NULL)
8916 return NULL;
8917 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008918 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008919 if (pnew == NULL) {
8920 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008921 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008922 }
Neal Norwitz419fd492008-03-17 20:22:43 +00008923 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008924 if (pnew->str == NULL) {
8925 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008926 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008927 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008928 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008929 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008930 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8931 pnew->length = n;
8932 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008933 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008934 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008935}
8936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008937PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008938"unicode(string [, encoding[, errors]]) -> object\n\
8939\n\
8940Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008941encoding defaults to the current default string encoding.\n\
8942errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008943
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008945 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 "unicode", /* tp_name */
8947 sizeof(PyUnicodeObject), /* tp_size */
8948 0, /* tp_itemsize */
8949 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008950 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008952 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008954 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008955 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008956 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008958 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 (hashfunc) unicode_hash, /* tp_hash*/
8960 0, /* tp_call*/
8961 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008962 PyObject_GenericGetAttr, /* tp_getattro */
8963 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008965 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008966 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008967 unicode_doc, /* tp_doc */
8968 0, /* tp_traverse */
8969 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008970 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008971 0, /* tp_weaklistoffset */
8972 0, /* tp_iter */
8973 0, /* tp_iternext */
8974 unicode_methods, /* tp_methods */
8975 0, /* tp_members */
8976 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008977 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008978 0, /* tp_dict */
8979 0, /* tp_descr_get */
8980 0, /* tp_descr_set */
8981 0, /* tp_dictoffset */
8982 0, /* tp_init */
8983 0, /* tp_alloc */
8984 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008985 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986};
8987
8988/* Initialize the Unicode implementation */
8989
Thomas Wouters78890102000-07-22 19:25:51 +00008990void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008992 int i;
8993
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008994 /* XXX - move this array to unicodectype.c ? */
8995 Py_UNICODE linebreak[] = {
8996 0x000A, /* LINE FEED */
8997 0x000D, /* CARRIAGE RETURN */
8998 0x001C, /* FILE SEPARATOR */
8999 0x001D, /* GROUP SEPARATOR */
9000 0x001E, /* RECORD SEPARATOR */
9001 0x0085, /* NEXT LINE */
9002 0x2028, /* LINE SEPARATOR */
9003 0x2029, /* PARAGRAPH SEPARATOR */
9004 };
9005
Fred Drakee4315f52000-05-09 19:53:39 +00009006 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009007 free_list = NULL;
9008 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009010 if (!unicode_empty)
9011 return;
9012
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009013 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009014 for (i = 0; i < 256; i++)
9015 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009016 if (PyType_Ready(&PyUnicode_Type) < 0)
9017 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009018
9019 /* initialize the linebreak bloom filter */
9020 bloom_linebreak = make_bloom_mask(
9021 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9022 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009023
9024 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025}
9026
9027/* Finalize the Unicode implementation */
9028
Christian Heimes3b718a72008-02-14 12:47:33 +00009029int
9030PyUnicode_ClearFreeList(void)
9031{
9032 int freelist_size = numfree;
9033 PyUnicodeObject *u;
9034
9035 for (u = free_list; u != NULL;) {
9036 PyUnicodeObject *v = u;
9037 u = *(PyUnicodeObject **)u;
9038 if (v->str)
Neal Norwitz419fd492008-03-17 20:22:43 +00009039 PyObject_DEL(v->str);
Christian Heimes3b718a72008-02-14 12:47:33 +00009040 Py_XDECREF(v->defenc);
9041 PyObject_Del(v);
9042 numfree--;
9043 }
9044 free_list = NULL;
9045 assert(numfree == 0);
9046 return freelist_size;
9047}
9048
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049void
Thomas Wouters78890102000-07-22 19:25:51 +00009050_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009052 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009054 Py_XDECREF(unicode_empty);
9055 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009056
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009057 for (i = 0; i < 256; i++) {
9058 if (unicode_latin1[i]) {
9059 Py_DECREF(unicode_latin1[i]);
9060 unicode_latin1[i] = NULL;
9061 }
9062 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009063 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009064}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009065
Anthony Baxterac6bd462006-04-13 02:06:09 +00009066#ifdef __cplusplus
9067}
9068#endif
9069
9070
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009071/*
9072Local variables:
9073c-basic-offset: 4
9074indent-tabs-mode: nil
9075End:
9076*/