blob: db907d6a0e02b89b66e108cfd28f674eac370220 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point for the ASCII range (128 entries):
   an entry is 1 if the code point is one of the whitespace characters
   listed in the row comments below, and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0009 HORIZONTAL TABULATION
       0x000A LINE FEED
       0x000B VERTICAL TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR
       0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Fast detection of line break characters in the ASCII range.

   Lookup table indexed by code point (128 entries): an entry is 1 if the
   code point is one of the line break characters listed in the row
   comments below, and 0 otherwise.  The table is only ever read, so it
   is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x000A LINE FEED
       0x000D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
/* Integer type that holds a bloom bitmask (bits 0..31 are used). */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long so that selecting bit 31 is
   well defined (1 << 31 overflows a 32-bit signed int), and the mask
   argument is parenthesized so any expression can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Nonzero if ch is a line break: ASCII characters are looked up in the
   ascii_linebreak table; others are pre-filtered through the bloom mask
   before the full Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* Nonzero if chr is a member of the character set (set, setlen): the
   bloom mask is tested first so that most non-members are rejected
   without scanning the set.  The expansion is fully parenthesized so the
   macro can be combined with other operators (e.g. negated) safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
294/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000295 Ux0000 terminated; some code (e.g. new_identifier)
296 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
299 free list never reduces its size below 1.
300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
314 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000315 if (free_list) {
316 unicode = free_list;
317 free_list = *(PyUnicodeObject **)unicode;
318 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000320 /* Keep-Alive optimization: we only upsize the buffer,
321 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000322 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000323 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000324 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000325 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000328 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000331 }
332 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 }
334 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000335 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000336 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 if (unicode == NULL)
338 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000339 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 }
342
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000343 if (!unicode->str) {
344 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000345 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000346 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000348 * the caller fails before initializing str -- unicode_resize()
349 * reads str[0], and the Keep-Alive optimization can keep memory
350 * allocated for str alive across a call to unicode_dealloc(unicode).
351 * We don't want unicode_resize to read uninitialized memory in
352 * that case.
353 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000354 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000358 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000359 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000361
362 onError:
363 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000364 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366}
367
368static
Guido van Rossum9475a232001-10-05 20:51:39 +0000369void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370{
Walter Dörwald16807132007-05-25 13:52:07 +0000371 switch (PyUnicode_CHECK_INTERNED(unicode)) {
372 case SSTATE_NOT_INTERNED:
373 break;
374
375 case SSTATE_INTERNED_MORTAL:
376 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000377 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
379 Py_FatalError(
380 "deletion of interned unicode string failed");
381 break;
382
383 case SSTATE_INTERNED_IMMORTAL:
384 Py_FatalError("Immortal interned unicode string died.");
385
386 default:
387 Py_FatalError("Inconsistent interned unicode string state.");
388 }
389
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000394 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 unicode->str = NULL;
396 unicode->length = 0;
397 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000398 if (unicode->defenc) {
399 Py_DECREF(unicode->defenc);
400 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000401 }
402 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000403 *(PyUnicodeObject **)unicode = free_list;
404 free_list = unicode;
405 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000406 }
407 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000408 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000409 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000410 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000411 }
412}
413
Martin v. Löwis18e16552006-02-15 17:27:45 +0000414int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
420 PyErr_BadInternalCall();
421 return -1;
422 }
423 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000425 PyErr_BadInternalCall();
426 return -1;
427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000439 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 *unicode = (PyObject *)w;
441 return 0;
442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of the object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
452
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000454 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455{
456 PyUnicodeObject *unicode;
457
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000458 /* If the Unicode data is known at construction time, we can apply
459 some optimizations which share commonly used objects. */
460 if (u != NULL) {
461
462 /* Optimization for empty strings */
463 if (size == 0 && unicode_empty != NULL) {
464 Py_INCREF(unicode_empty);
465 return (PyObject *)unicode_empty;
466 }
467
468 /* Single character Unicode objects in the Latin-1 range are
469 shared when using this constructor */
470 if (size == 1 && *u < 256) {
471 unicode = unicode_latin1[*u];
472 if (!unicode) {
473 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 if (!unicode)
475 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000476 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 unicode_latin1[*u] = unicode;
478 }
479 Py_INCREF(unicode);
480 return (PyObject *)unicode;
481 }
482 }
Tim Petersced69f82003-09-16 20:30:58 +0000483
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484 unicode = _PyUnicode_New(size);
485 if (!unicode)
486 return NULL;
487
488 /* Copy the Unicode data into the new object */
489 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000490 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491
492 return (PyObject *)unicode;
493}
494
Walter Dörwaldd2034312007-05-18 16:29:38 +0000495PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000496{
497 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000498
499 if (size < 0) {
500 PyErr_SetString(PyExc_SystemError,
501 "Negative size passed to PyUnicode_FromStringAndSize");
502 return NULL;
503 }
504
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000505 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000506 some optimizations which share commonly used objects.
507 Also, this means the input must be UTF-8, so fall back to the
508 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000509 if (u != NULL) {
510
511 /* Optimization for empty strings */
512 if (size == 0 && unicode_empty != NULL) {
513 Py_INCREF(unicode_empty);
514 return (PyObject *)unicode_empty;
515 }
516
Martin v. Löwis9c121062007-08-05 20:26:11 +0000517 /* Single characters are shared when using this constructor.
518 Restrict to ASCII, since the input must be UTF-8. */
519 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000520 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521 if (!unicode) {
522 unicode = _PyUnicode_New(1);
523 if (!unicode)
524 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000525 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000526 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 }
528 Py_INCREF(unicode);
529 return (PyObject *)unicode;
530 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531
532 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000533 }
534
Walter Dörwald55507312007-05-18 13:12:10 +0000535 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 if (!unicode)
537 return NULL;
538
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000539 return (PyObject *)unicode;
540}
541
Walter Dörwaldd2034312007-05-18 16:29:38 +0000542PyObject *PyUnicode_FromString(const char *u)
543{
544 size_t size = strlen(u);
545 if (size > PY_SSIZE_T_MAX) {
546 PyErr_SetString(PyExc_OverflowError, "input too long");
547 return NULL;
548 }
549
550 return PyUnicode_FromStringAndSize(u, size);
551}
552
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553#ifdef HAVE_WCHAR_H
554
555PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000556 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000557{
558 PyUnicodeObject *unicode;
559
560 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000561 if (size == 0)
562 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 PyErr_BadInternalCall();
564 return NULL;
565 }
566
Martin v. Löwis790465f2008-04-05 20:41:37 +0000567 if (size == -1) {
568 size = wcslen(w);
569 }
570
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571 unicode = _PyUnicode_New(size);
572 if (!unicode)
573 return NULL;
574
575 /* Copy the wchar_t data into the new object */
576#ifdef HAVE_USABLE_WCHAR_T
577 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000578#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579 {
580 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000581 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000583 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584 *u++ = *w++;
585 }
586#endif
587
588 return (PyObject *)unicode;
589}
590
Walter Dörwald346737f2007-05-31 10:44:43 +0000591static void
592makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
593{
594 *fmt++ = '%';
595 if (width) {
596 if (zeropad)
597 *fmt++ = '0';
598 fmt += sprintf(fmt, "%d", width);
599 }
600 if (precision)
601 fmt += sprintf(fmt, ".%d", precision);
602 if (longflag)
603 *fmt++ = 'l';
604 else if (size_tflag) {
605 char *f = PY_FORMAT_SIZE_T;
606 while (*f)
607 *fmt++ = *f++;
608 }
609 *fmt++ = c;
610 *fmt = '\0';
611}
612
/* Copy the NUL-terminated string `string` to the output position `s`,
   advancing `s` past the copied characters (the terminator is not
   copied).  Relies on variables named `copy` and `s` being in scope at
   the point of use. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
614
615PyObject *
616PyUnicode_FromFormatV(const char *format, va_list vargs)
617{
618 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000619 Py_ssize_t callcount = 0;
620 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000621 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000623 int width = 0;
624 int precision = 0;
625 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000626 const char* f;
627 Py_UNICODE *s;
628 PyObject *string;
629 /* used by sprintf */
630 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000631 /* use abuffer instead of buffer, if we need more space
632 * (which can happen if there's a format specifier with width). */
633 char *abuffer = NULL;
634 char *realbuffer;
635 Py_ssize_t abuffersize = 0;
636 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000637 const char *copy;
638
639#ifdef VA_LIST_IS_ARRAY
640 Py_MEMCPY(count, vargs, sizeof(va_list));
641#else
642#ifdef __va_copy
643 __va_copy(count, vargs);
644#else
645 count = vargs;
646#endif
647#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000648 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000649 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000650 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000651 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000652 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000653 ++callcount;
654 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000655 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000656 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000657 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000658 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 if (!callresults) {
660 PyErr_NoMemory();
661 return NULL;
662 }
663 callresult = callresults;
664 }
665 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000666 for (f = format; *f; f++) {
667 if (*f == '%') {
668 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000669 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000670 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000671 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000672 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000673 ;
674
675 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
676 * they don't affect the amount of space we reserve.
677 */
678 if ((*f == 'l' || *f == 'z') &&
679 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000680 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000681
682 switch (*f) {
683 case 'c':
684 (void)va_arg(count, int);
685 /* fall through... */
686 case '%':
687 n++;
688 break;
689 case 'd': case 'u': case 'i': case 'x':
690 (void) va_arg(count, int);
691 /* 20 bytes is enough to hold a 64-bit
692 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000693 This isn't enough for octal.
694 If a width is specified we need more
695 (which we allocate later). */
696 if (width < 20)
697 width = 20;
698 n += width;
699 if (abuffersize < width)
700 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000701 break;
702 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000703 {
704 /* UTF-8 */
705 unsigned char*s;
706 s = va_arg(count, unsigned char*);
707 while (*s) {
708 if (*s < 128) {
709 n++; s++;
710 } else if (*s < 0xc0) {
711 /* invalid UTF-8 */
712 n++; s++;
713 } else if (*s < 0xc0) {
714 n++;
715 s++; if(!*s)break;
716 s++;
717 } else if (*s < 0xe0) {
718 n++;
719 s++; if(!*s)break;
720 s++; if(!*s)break;
721 s++;
722 } else {
723 #ifdef Py_UNICODE_WIDE
724 n++;
725 #else
726 n+=2;
727 #endif
728 s++; if(!*s)break;
729 s++; if(!*s)break;
730 s++; if(!*s)break;
731 s++;
732 }
733 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000734 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000735 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736 case 'U':
737 {
738 PyObject *obj = va_arg(count, PyObject *);
739 assert(obj && PyUnicode_Check(obj));
740 n += PyUnicode_GET_SIZE(obj);
741 break;
742 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000743 case 'V':
744 {
745 PyObject *obj = va_arg(count, PyObject *);
746 const char *str = va_arg(count, const char *);
747 assert(obj || str);
748 assert(!obj || PyUnicode_Check(obj));
749 if (obj)
750 n += PyUnicode_GET_SIZE(obj);
751 else
752 n += strlen(str);
753 break;
754 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000755 case 'S':
756 {
757 PyObject *obj = va_arg(count, PyObject *);
758 PyObject *str;
759 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000760 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000761 if (!str)
762 goto fail;
763 n += PyUnicode_GET_SIZE(str);
764 /* Remember the str and switch to the next slot */
765 *callresult++ = str;
766 break;
767 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000768 case 'R':
769 {
770 PyObject *obj = va_arg(count, PyObject *);
771 PyObject *repr;
772 assert(obj);
773 repr = PyObject_Repr(obj);
774 if (!repr)
775 goto fail;
776 n += PyUnicode_GET_SIZE(repr);
777 /* Remember the repr and switch to the next slot */
778 *callresult++ = repr;
779 break;
780 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000781 case 'p':
782 (void) va_arg(count, int);
783 /* maximum 64-bit pointer representation:
784 * 0xffffffffffffffff
785 * so 19 characters is enough.
786 * XXX I count 18 -- what's the extra for?
787 */
788 n += 19;
789 break;
790 default:
791 /* if we stumble upon an unknown
792 formatting code, copy the rest of
793 the format string to the output
794 string. (we cannot just skip the
795 code, since there's no way to know
796 what's in the argument list) */
797 n += strlen(p);
798 goto expand;
799 }
800 } else
801 n++;
802 }
803 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000804 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000805 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000806 if (!abuffer) {
807 PyErr_NoMemory();
808 goto fail;
809 }
810 realbuffer = abuffer;
811 }
812 else
813 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000814 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000815 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 we don't have to resize the string.
817 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818 string = PyUnicode_FromUnicode(NULL, n);
819 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000820 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000821
822 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000823 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000824
825 for (f = format; *f; f++) {
826 if (*f == '%') {
827 const char* p = f++;
828 int longflag = 0;
829 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000830 zeropad = (*f == '0');
831 /* parse the width.precision part */
832 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000833 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000834 width = (width*10) + *f++ - '0';
835 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 if (*f == '.') {
837 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000838 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000839 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000840 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000841 /* handle the long flag, but only for %ld and %lu.
842 others can be added when necessary. */
843 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
844 longflag = 1;
845 ++f;
846 }
847 /* handle the size_t flag. */
848 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
849 size_tflag = 1;
850 ++f;
851 }
852
853 switch (*f) {
854 case 'c':
855 *s++ = va_arg(vargs, int);
856 break;
857 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000858 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000862 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000864 sprintf(realbuffer, fmt, va_arg(vargs, int));
865 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000866 break;
867 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000868 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000869 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000870 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000871 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000872 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000873 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000874 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
875 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000876 break;
877 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000878 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
879 sprintf(realbuffer, fmt, va_arg(vargs, int));
880 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000881 break;
882 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000883 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
884 sprintf(realbuffer, fmt, va_arg(vargs, int));
885 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000886 break;
887 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000888 {
889 /* Parameter must be UTF-8 encoded.
890 In case of encoding errors, use
891 the replacement character. */
892 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000893 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000894 u = PyUnicode_DecodeUTF8(p, strlen(p),
895 "replace");
896 if (!u)
897 goto fail;
898 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
899 PyUnicode_GET_SIZE(u));
900 s += PyUnicode_GET_SIZE(u);
901 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000902 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000903 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904 case 'U':
905 {
906 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000907 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
908 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
909 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000910 break;
911 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000912 case 'V':
913 {
914 PyObject *obj = va_arg(vargs, PyObject *);
915 const char *str = va_arg(vargs, const char *);
916 if (obj) {
917 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
918 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
919 s += size;
920 } else {
921 appendstring(str);
922 }
923 break;
924 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000925 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000926 case 'R':
927 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000928 Py_UNICODE *ucopy;
929 Py_ssize_t usize;
930 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000931 /* unused, since we already have the result */
932 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000933 ucopy = PyUnicode_AS_UNICODE(*callresult);
934 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000935 for (upos = 0; upos<usize;)
936 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000937 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000938 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000939 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000940 ++callresult;
941 break;
942 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000943 case 'p':
944 sprintf(buffer, "%p", va_arg(vargs, void*));
945 /* %p is ill-defined: ensure leading 0x. */
946 if (buffer[1] == 'X')
947 buffer[1] = 'x';
948 else if (buffer[1] != 'x') {
949 memmove(buffer+2, buffer, strlen(buffer)+1);
950 buffer[0] = '0';
951 buffer[1] = 'x';
952 }
953 appendstring(buffer);
954 break;
955 case '%':
956 *s++ = '%';
957 break;
958 default:
959 appendstring(p);
960 goto end;
961 }
962 } else
963 *s++ = *f;
964 }
965
966 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000967 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000968 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000969 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000970 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000971 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
972 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000973 fail:
974 if (callresults) {
975 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000976 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000977 Py_DECREF(*callresult2);
978 ++callresult2;
979 }
Christian Heimesb186d002008-03-18 15:15:01 +0000980 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000981 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000982 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000983 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000984 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000985}
986
987#undef appendstring
988
989PyObject *
990PyUnicode_FromFormat(const char *format, ...)
991{
992 PyObject* ret;
993 va_list vargs;
994
995#ifdef HAVE_STDARG_PROTOTYPES
996 va_start(vargs, format);
997#else
998 va_start(vargs);
999#endif
1000 ret = PyUnicode_FromFormatV(format, vargs);
1001 va_end(vargs);
1002 return ret;
1003}
1004
Martin v. Löwis18e16552006-02-15 17:27:45 +00001005Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1006 wchar_t *w,
1007 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008{
1009 if (unicode == NULL) {
1010 PyErr_BadInternalCall();
1011 return -1;
1012 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001013
1014 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001016 size = PyUnicode_GET_SIZE(unicode) + 1;
1017
Guido van Rossumd57fd912000-03-10 22:53:23 +00001018#ifdef HAVE_USABLE_WCHAR_T
1019 memcpy(w, unicode->str, size * sizeof(wchar_t));
1020#else
1021 {
1022 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001023 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001025 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 *w++ = *u++;
1027 }
1028#endif
1029
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001030 if (size > PyUnicode_GET_SIZE(unicode))
1031 return PyUnicode_GET_SIZE(unicode);
1032 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 return size;
1034}
1035
1036#endif
1037
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001038PyObject *PyUnicode_FromOrdinal(int ordinal)
1039{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001040 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001041
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001042 if (ordinal < 0 || ordinal > 0x10ffff) {
1043 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001044 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001045 return NULL;
1046 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001047
1048#ifndef Py_UNICODE_WIDE
1049 if (ordinal > 0xffff) {
1050 ordinal -= 0x10000;
1051 s[0] = 0xD800 | (ordinal >> 10);
1052 s[1] = 0xDC00 | (ordinal & 0x3FF);
1053 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001054 }
1055#endif
1056
Hye-Shik Chang40574832004-04-06 07:24:51 +00001057 s[0] = (Py_UNICODE)ordinal;
1058 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001059}
1060
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061PyObject *PyUnicode_FromObject(register PyObject *obj)
1062{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001064 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065 if (PyUnicode_CheckExact(obj)) {
1066 Py_INCREF(obj);
1067 return obj;
1068 }
1069 if (PyUnicode_Check(obj)) {
1070 /* For a Unicode subtype that's not a Unicode object,
1071 return a true Unicode object with the same data. */
1072 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1073 PyUnicode_GET_SIZE(obj));
1074 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001075 PyErr_Format(PyExc_TypeError,
1076 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001077 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001078 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001079}
1080
1081PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1082 const char *encoding,
1083 const char *errors)
1084{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001085 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001087 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001088
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089 if (obj == NULL) {
1090 PyErr_BadInternalCall();
1091 return NULL;
1092 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001093
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001094 if (PyUnicode_Check(obj)) {
1095 PyErr_SetString(PyExc_TypeError,
1096 "decoding Unicode is not supported");
1097 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001098 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099
1100 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001101 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001102 s = PyBytes_AS_STRING(obj);
1103 len = PyBytes_GET_SIZE(obj);
1104 }
1105 else if (PyByteArray_Check(obj)) {
1106 s = PyByteArray_AS_STRING(obj);
1107 len = PyByteArray_GET_SIZE(obj);
1108 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001109 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1110 /* Overwrite the error message with something more useful in
1111 case of a TypeError. */
1112 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001113 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001114 "coercing to Unicode: need string or buffer, "
1115 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001116 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001117 goto onError;
1118 }
Tim Petersced69f82003-09-16 20:30:58 +00001119
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001120 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 if (len == 0) {
1122 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001123 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124 }
Tim Petersced69f82003-09-16 20:30:58 +00001125 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001126 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001127
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001128 return v;
1129
1130 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132}
1133
1134PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001135 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 const char *encoding,
1137 const char *errors)
1138{
1139 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001140 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001141 char lower[20]; /* Enough for any encoding name we recognize */
1142 char *l;
1143 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001144
1145 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001146 encoding = PyUnicode_GetDefaultEncoding();
1147
1148 /* Convert encoding to lower case and replace '_' with '-' in order to
1149 catch e.g. UTF_8 */
1150 e = encoding;
1151 l = lower;
1152 while (*e && l < &lower[(sizeof lower) - 2]) {
1153 if (ISUPPER(*e)) {
1154 *l++ = TOLOWER(*e++);
1155 }
1156 else if (*e == '_') {
1157 *l++ = '-';
1158 e++;
1159 }
1160 else {
1161 *l++ = *e++;
1162 }
1163 }
1164 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001165
1166 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001167 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001169 else if ((strcmp(lower, "latin-1") == 0) ||
1170 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001171 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001172#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001173 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001174 return PyUnicode_DecodeMBCS(s, size, errors);
1175#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001176 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001177 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001178 else if (strcmp(lower, "utf-16") == 0)
1179 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1180 else if (strcmp(lower, "utf-32") == 0)
1181 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182
1183 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001184 buffer = NULL;
1185 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1186 goto onError;
1187 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 if (buffer == NULL)
1189 goto onError;
1190 unicode = PyCodec_Decode(buffer, encoding, errors);
1191 if (unicode == NULL)
1192 goto onError;
1193 if (!PyUnicode_Check(unicode)) {
1194 PyErr_Format(PyExc_TypeError,
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001195 "decoder did not return a unicode object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001196 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 Py_DECREF(unicode);
1198 goto onError;
1199 }
1200 Py_DECREF(buffer);
1201 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001202
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 onError:
1204 Py_XDECREF(buffer);
1205 return NULL;
1206}
1207
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001208PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1209 const char *encoding,
1210 const char *errors)
1211{
1212 PyObject *v;
1213
1214 if (!PyUnicode_Check(unicode)) {
1215 PyErr_BadArgument();
1216 goto onError;
1217 }
1218
1219 if (encoding == NULL)
1220 encoding = PyUnicode_GetDefaultEncoding();
1221
1222 /* Decode via the codec registry */
1223 v = PyCodec_Decode(unicode, encoding, errors);
1224 if (v == NULL)
1225 goto onError;
1226 return v;
1227
1228 onError:
1229 return NULL;
1230}
1231
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001232PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1233 const char *encoding,
1234 const char *errors)
1235{
1236 PyObject *v;
1237
1238 if (!PyUnicode_Check(unicode)) {
1239 PyErr_BadArgument();
1240 goto onError;
1241 }
1242
1243 if (encoding == NULL)
1244 encoding = PyUnicode_GetDefaultEncoding();
1245
1246 /* Decode via the codec registry */
1247 v = PyCodec_Decode(unicode, encoding, errors);
1248 if (v == NULL)
1249 goto onError;
1250 if (!PyUnicode_Check(v)) {
1251 PyErr_Format(PyExc_TypeError,
1252 "decoder did not return a unicode object (type=%.400s)",
1253 Py_TYPE(v)->tp_name);
1254 Py_DECREF(v);
1255 goto onError;
1256 }
1257 return v;
1258
1259 onError:
1260 return NULL;
1261}
1262
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001264 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 const char *encoding,
1266 const char *errors)
1267{
1268 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001269
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270 unicode = PyUnicode_FromUnicode(s, size);
1271 if (unicode == NULL)
1272 return NULL;
1273 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1274 Py_DECREF(unicode);
1275 return v;
1276}
1277
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001278PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1279 const char *encoding,
1280 const char *errors)
1281{
1282 PyObject *v;
1283
1284 if (!PyUnicode_Check(unicode)) {
1285 PyErr_BadArgument();
1286 goto onError;
1287 }
1288
1289 if (encoding == NULL)
1290 encoding = PyUnicode_GetDefaultEncoding();
1291
1292 /* Encode via the codec registry */
1293 v = PyCodec_Encode(unicode, encoding, errors);
1294 if (v == NULL)
1295 goto onError;
1296 return v;
1297
1298 onError:
1299 return NULL;
1300}
1301
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1303 const char *encoding,
1304 const char *errors)
1305{
1306 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001307
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 if (!PyUnicode_Check(unicode)) {
1309 PyErr_BadArgument();
1310 goto onError;
1311 }
Fred Drakee4315f52000-05-09 19:53:39 +00001312
Tim Petersced69f82003-09-16 20:30:58 +00001313 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001314 encoding = PyUnicode_GetDefaultEncoding();
1315
1316 /* Shortcuts for common default encodings */
1317 if (errors == NULL) {
1318 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001319 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001320 else if (strcmp(encoding, "latin-1") == 0)
1321 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001322#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1323 else if (strcmp(encoding, "mbcs") == 0)
1324 return PyUnicode_AsMBCSString(unicode);
1325#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001326 else if (strcmp(encoding, "ascii") == 0)
1327 return PyUnicode_AsASCIIString(unicode);
1328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329
1330 /* Encode via the codec registry */
1331 v = PyCodec_Encode(unicode, encoding, errors);
1332 if (v == NULL)
1333 goto onError;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001334 if (PyByteArray_Check(v)) {
1335 char msg[100];
1336 PyOS_snprintf(msg, sizeof(msg),
1337 "encoder %s returned buffer instead of bytes",
1338 encoding);
1339 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1340 v = NULL;
1341 goto onError;
1342 }
1343 v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1344 }
1345 else if (!PyBytes_Check(v)) {
1346 PyErr_Format(PyExc_TypeError,
1347 "encoder did not return a bytes object (type=%.400s)",
1348 Py_TYPE(v)->tp_name);
1349 v = NULL;
1350 }
1351 return v;
1352
1353 onError:
1354 return NULL;
1355}
1356
1357PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1358 const char *encoding,
1359 const char *errors)
1360{
1361 PyObject *v;
1362
1363 if (!PyUnicode_Check(unicode)) {
1364 PyErr_BadArgument();
1365 goto onError;
1366 }
1367
1368 if (encoding == NULL)
1369 encoding = PyUnicode_GetDefaultEncoding();
1370
1371 /* Encode via the codec registry */
1372 v = PyCodec_Encode(unicode, encoding, errors);
1373 if (v == NULL)
1374 goto onError;
1375 if (!PyUnicode_Check(v)) {
1376 PyErr_Format(PyExc_TypeError,
1377 "encoder did not return an unicode object (type=%.400s)",
1378 Py_TYPE(v)->tp_name);
1379 Py_DECREF(v);
1380 goto onError;
1381 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 onError:
1385 return NULL;
1386}
1387
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001388PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1389 const char *errors)
1390{
1391 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001392 if (v)
1393 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001394 if (errors != NULL)
1395 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001396 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001397 PyUnicode_GET_SIZE(unicode),
1398 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001399 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001400 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001401 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001402 return v;
1403}
1404
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001405PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001406PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001407 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001408 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1409}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001410
Christian Heimes5894ba72007-11-04 11:43:14 +00001411PyObject*
1412PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1413{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001414 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1415 can be undefined. If it is case, decode using UTF-8. The following assumes
1416 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1417 bootstrapping process where the codecs aren't ready yet.
1418 */
1419 if (Py_FileSystemDefaultEncoding) {
1420#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001421 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001422 return PyUnicode_DecodeMBCS(s, size, "replace");
1423 }
1424#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001425 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001426 return PyUnicode_DecodeUTF8(s, size, "replace");
1427 }
1428#endif
1429 return PyUnicode_Decode(s, size,
1430 Py_FileSystemDefaultEncoding,
1431 "replace");
1432 }
1433 else {
1434 return PyUnicode_DecodeUTF8(s, size, "replace");
1435 }
1436}
1437
Martin v. Löwis5b222132007-06-10 09:51:05 +00001438char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001439PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001440{
Christian Heimesf3863112007-11-22 07:46:41 +00001441 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001442 if (!PyUnicode_Check(unicode)) {
1443 PyErr_BadArgument();
1444 return NULL;
1445 }
Christian Heimesf3863112007-11-22 07:46:41 +00001446 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1447 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001448 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001449 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001450 *psize = PyBytes_GET_SIZE(bytes);
1451 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001452}
1453
1454char*
1455PyUnicode_AsString(PyObject *unicode)
1456{
1457 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001458}
1459
Guido van Rossumd57fd912000-03-10 22:53:23 +00001460Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1461{
1462 if (!PyUnicode_Check(unicode)) {
1463 PyErr_BadArgument();
1464 goto onError;
1465 }
1466 return PyUnicode_AS_UNICODE(unicode);
1467
1468 onError:
1469 return NULL;
1470}
1471
Martin v. Löwis18e16552006-02-15 17:27:45 +00001472Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473{
1474 if (!PyUnicode_Check(unicode)) {
1475 PyErr_BadArgument();
1476 goto onError;
1477 }
1478 return PyUnicode_GET_SIZE(unicode);
1479
1480 onError:
1481 return -1;
1482}
1483
Thomas Wouters78890102000-07-22 19:25:51 +00001484const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001485{
1486 return unicode_default_encoding;
1487}
1488
1489int PyUnicode_SetDefaultEncoding(const char *encoding)
1490{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001491 if (strcmp(encoding, unicode_default_encoding) != 0) {
1492 PyErr_Format(PyExc_ValueError,
1493 "Can only set default encoding to %s",
1494 unicode_default_encoding);
1495 return -1;
1496 }
Fred Drakee4315f52000-05-09 19:53:39 +00001497 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001498}
1499
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500/* error handling callback helper:
1501 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001502 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 and adjust various state variables.
1504 return 0 on success, -1 on error
1505*/
1506
1507static
1508int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1509 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001510 const char **input, const char **inend, Py_ssize_t *startinpos,
1511 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001512 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001513{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001514 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001515
1516 PyObject *restuple = NULL;
1517 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001518 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001519 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t requiredsize;
1521 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001522 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001523 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001524 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 int res = -1;
1526
1527 if (*errorHandler == NULL) {
1528 *errorHandler = PyCodec_LookupError(errors);
1529 if (*errorHandler == NULL)
1530 goto onError;
1531 }
1532
1533 if (*exceptionObject == NULL) {
1534 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001535 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536 if (*exceptionObject == NULL)
1537 goto onError;
1538 }
1539 else {
1540 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1541 goto onError;
1542 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1543 goto onError;
1544 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1545 goto onError;
1546 }
1547
1548 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1549 if (restuple == NULL)
1550 goto onError;
1551 if (!PyTuple_Check(restuple)) {
1552 PyErr_Format(PyExc_TypeError, &argparse[4]);
1553 goto onError;
1554 }
1555 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1556 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001557
1558 /* Copy back the bytes variables, which might have been modified by the
1559 callback */
1560 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1561 if (!inputobj)
1562 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001563 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001564 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1565 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001566 *input = PyBytes_AS_STRING(inputobj);
1567 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001568 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001569 /* we can DECREF safely, as the exception has another reference,
1570 so the object won't go away. */
1571 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001573 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001574 newpos = insize+newpos;
1575 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001576 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001577 goto onError;
1578 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579
1580 /* need more space? (at least enough for what we
1581 have+the replacement+the rest of the string (starting
1582 at the new input position), so we won't have to check space
1583 when there are no errors in the rest of the string) */
1584 repptr = PyUnicode_AS_UNICODE(repunicode);
1585 repsize = PyUnicode_GET_SIZE(repunicode);
1586 requiredsize = *outpos + repsize + insize-newpos;
1587 if (requiredsize > outsize) {
1588 if (requiredsize<2*outsize)
1589 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001590 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001591 goto onError;
1592 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1593 }
1594 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001595 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 Py_UNICODE_COPY(*outptr, repptr, repsize);
1597 *outptr += repsize;
1598 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 /* we made it! */
1601 res = 0;
1602
1603 onError:
1604 Py_XDECREF(restuple);
1605 return res;
1606}
1607
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC 2152 for details */
1611
/* Classification of the 128 ASCII code points for the UTF-7 codec: it
   indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   Each row below covers 16 consecutive code points. */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1630
/* Helper macros shared by the UTF-7 decoder and encoder.  Every macro
   argument is parenthesized in the expansion and the statement-like
   macros are wrapped in do { ... } while (0), so that they behave like
   a single statement (e.g. as the body of an unbraced if/else) and
   accept arbitrary expressions as arguments. */

/* SPECIAL(c, encodeO, encodeWS) is true when character c has to go
   through a base64 shift sequence: it lies outside 1..127, is marked 1
   in utf7_special, is marked 2 (whitespace) while encodeWS is set, or is
   marked 3 (Set O) while encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)                                                  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is a base64 digit (relies on ISALNUM, which is
   defined elsewhere). */
#define B64CHAR(c)                                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 digit c; only meaningful when
   B64CHAR(c) holds. */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?           \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in the bit
   buffer ch, write the topmost six of them through out as one base64
   digit.  Updates out and bits; ch itself is left untouched. */
#define ENCODE(out, ch, bits)                                   \
    do {                                                        \
        while ((bits) >= 6) {                                   \
            *(out)++ = B64((ch) >> ((bits) - 6));               \
            (bits) -= 6;                                        \
        }                                                       \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in the bit buffer ch, take the topmost sixteen as one code
   unit and store it through out.

   A code unit in the range 0xDC00..0xDFFF is not stored: it belongs to a
   surrogate pair, which unfortunately cannot be represented in a 16-bit
   character.  In that case surrogate is set, errmsg is assigned and
   control jumps to the utf7Error label.  When surrogate is already set
   on entry, an error has been generated before, so the code unit is
   dropped without checking it and the flag is cleared.

   The expansion uses the identifier errmsg and the label utf7Error from
   the enclosing function. */
#define DECODE(out, ch, bits, surrogate)                                    \
    do {                                                                    \
        while ((bits) >= 16) {                                              \
            Py_UNICODE outCh =                                              \
                (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff);            \
            (bits) -= 16;                                                   \
            if (surrogate) {                                                \
                (surrogate) = 0;                                            \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
                (surrogate) = 1;                                            \
                errmsg = "code pairs are not supported";                    \
                goto utf7Error;                                             \
            } else {                                                        \
                *(out)++ = outCh;                                           \
            }                                                               \
        }                                                                   \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001674PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001675 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676 const char *errors)
1677{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001678 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1679}
1680
1681PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1682 Py_ssize_t size,
1683 const char *errors,
1684 Py_ssize_t *consumed)
1685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001686 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001687 Py_ssize_t startinpos;
1688 Py_ssize_t endinpos;
1689 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001690 const char *e;
1691 PyUnicodeObject *unicode;
1692 Py_UNICODE *p;
1693 const char *errmsg = "";
1694 int inShift = 0;
1695 unsigned int bitsleft = 0;
1696 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001697 int surrogate = 0;
1698 PyObject *errorHandler = NULL;
1699 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001700
1701 unicode = _PyUnicode_New(size);
1702 if (!unicode)
1703 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001704 if (size == 0) {
1705 if (consumed)
1706 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001707 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001708 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001709
1710 p = unicode->str;
1711 e = s + size;
1712
1713 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714 Py_UNICODE ch;
1715 restart:
1716 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001717
1718 if (inShift) {
1719 if ((ch == '-') || !B64CHAR(ch)) {
1720 inShift = 0;
1721 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001722
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001723 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1724 if (bitsleft >= 6) {
1725 /* The shift sequence has a partial character in it. If
1726 bitsleft < 6 then we could just classify it as padding
1727 but that is not the case here */
1728
1729 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001730 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001731 }
1732 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001733 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001734 here so indicate the potential of a misencoded character. */
1735
1736 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1737 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1738 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001739 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 }
1741
1742 if (ch == '-') {
1743 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001744 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001745 inShift = 1;
1746 }
1747 } else if (SPECIAL(ch,0,0)) {
1748 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001749 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 } else {
1751 *p++ = ch;
1752 }
1753 } else {
1754 charsleft = (charsleft << 6) | UB64(ch);
1755 bitsleft += 6;
1756 s++;
1757 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1758 }
1759 }
1760 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001762 s++;
1763 if (s < e && *s == '-') {
1764 s++;
1765 *p++ = '+';
1766 } else
1767 {
1768 inShift = 1;
1769 bitsleft = 0;
1770 }
1771 }
1772 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001773 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001774 errmsg = "unexpected special character";
1775 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001776 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001777 }
1778 else {
1779 *p++ = ch;
1780 s++;
1781 }
1782 continue;
1783 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001784 outpos = p-PyUnicode_AS_UNICODE(unicode);
1785 endinpos = s-starts;
1786 if (unicode_decode_call_errorhandler(
1787 errors, &errorHandler,
1788 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001789 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 (PyObject **)&unicode, &outpos, &p))
1791 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001792 }
1793
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001794 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 outpos = p-PyUnicode_AS_UNICODE(unicode);
1796 endinpos = size;
1797 if (unicode_decode_call_errorhandler(
1798 errors, &errorHandler,
1799 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001800 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001802 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 if (s < e)
1804 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001805 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001806 if (consumed) {
1807 if(inShift)
1808 *consumed = startinpos;
1809 else
1810 *consumed = s-starts;
1811 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001812
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001813 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001814 goto onError;
1815
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 Py_XDECREF(errorHandler);
1817 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001818 return (PyObject *)unicode;
1819
1820onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001821 Py_XDECREF(errorHandler);
1822 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001823 Py_DECREF(unicode);
1824 return NULL;
1825}
1826
1827
1828PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001829 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830 int encodeSetO,
1831 int encodeWhiteSpace,
1832 const char *errors)
1833{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001834 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001835 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001836 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001837 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001838 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001839 unsigned int bitsleft = 0;
1840 unsigned long charsleft = 0;
1841 char * out;
1842 char * start;
1843
1844 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001845 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001846
Christian Heimes9c4756e2008-05-26 13:22:05 +00001847 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001848 if (v == NULL)
1849 return NULL;
1850
Christian Heimes9c4756e2008-05-26 13:22:05 +00001851 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001852 for (;i < size; ++i) {
1853 Py_UNICODE ch = s[i];
1854
1855 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001856 if (ch == '+') {
1857 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001858 *out++ = '-';
1859 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1860 charsleft = ch;
1861 bitsleft = 16;
1862 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001863 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001864 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001865 } else {
1866 *out++ = (char) ch;
1867 }
1868 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001869 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1870 *out++ = B64(charsleft << (6-bitsleft));
1871 charsleft = 0;
1872 bitsleft = 0;
1873 /* Characters not in the BASE64 set implicitly unshift the sequence
1874 so no '-' is required, except if the character is itself a '-' */
1875 if (B64CHAR(ch) || ch == '-') {
1876 *out++ = '-';
1877 }
1878 inShift = 0;
1879 *out++ = (char) ch;
1880 } else {
1881 bitsleft += 16;
1882 charsleft = (charsleft << 16) | ch;
1883 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1884
1885 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001886 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001887 or '-' then the shift sequence will be terminated implicitly and we
1888 don't have to insert a '-'. */
1889
1890 if (bitsleft == 0) {
1891 if (i + 1 < size) {
1892 Py_UNICODE ch2 = s[i+1];
1893
1894 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001895
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001896 } else if (B64CHAR(ch2) || ch2 == '-') {
1897 *out++ = '-';
1898 inShift = 0;
1899 } else {
1900 inShift = 0;
1901 }
1902
1903 }
1904 else {
1905 *out++ = '-';
1906 inShift = 0;
1907 }
1908 }
Tim Petersced69f82003-09-16 20:30:58 +00001909 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001910 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001911 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001912 if (bitsleft) {
1913 *out++= B64(charsleft << (6-bitsleft) );
1914 *out++ = '-';
1915 }
1916
Christian Heimes72b710a2008-05-26 13:28:38 +00001917 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001918 Py_DECREF(v);
1919 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001920}
1921
1922#undef SPECIAL
1923#undef B64
1924#undef B64CHAR
1925#undef UB64
1926#undef ENCODE
1927#undef DECODE
1928
/* --- UTF-8 Codec -------------------------------------------------------- */
1930
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means illegal
   prefix.  See RFC 2279 for details.  Each row below covers 16
   consecutive byte values. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1952
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001954 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 const char *errors)
1956{
Walter Dörwald69652032004-09-07 20:24:22 +00001957 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1958}
1959
1960PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001961 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001962 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001963 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001964{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001967 Py_ssize_t startinpos;
1968 Py_ssize_t endinpos;
1969 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 const char *e;
1971 PyUnicodeObject *unicode;
1972 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001973 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001974 PyObject *errorHandler = NULL;
1975 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976
1977 /* Note: size will always be longer than the resulting Unicode
1978 character count */
1979 unicode = _PyUnicode_New(size);
1980 if (!unicode)
1981 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001982 if (size == 0) {
1983 if (consumed)
1984 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987
1988 /* Unpack UTF-8 encoded data */
1989 p = unicode->str;
1990 e = s + size;
1991
1992 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001993 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994
1995 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001996 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 s++;
1998 continue;
1999 }
2000
2001 n = utf8_code_length[ch];
2002
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002003 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002004 if (consumed)
2005 break;
2006 else {
2007 errmsg = "unexpected end of data";
2008 startinpos = s-starts;
2009 endinpos = size;
2010 goto utf8Error;
2011 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013
2014 switch (n) {
2015
2016 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002017 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002018 startinpos = s-starts;
2019 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002020 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021
2022 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002023 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002024 startinpos = s-starts;
2025 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002026 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027
2028 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002029 if ((s[1] & 0xc0) != 0x80) {
2030 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002031 startinpos = s-starts;
2032 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002033 goto utf8Error;
2034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002036 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037 startinpos = s-starts;
2038 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002039 errmsg = "illegal encoding";
2040 goto utf8Error;
2041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002043 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 break;
2045
2046 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002047 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002048 (s[2] & 0xc0) != 0x80) {
2049 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050 startinpos = s-starts;
2051 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002052 goto utf8Error;
2053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002055 if (ch < 0x0800) {
2056 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002057 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002058
2059 XXX For wide builds (UCS-4) we should probably try
2060 to recombine the surrogates into a single code
2061 unit.
2062 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002063 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 startinpos = s-starts;
2065 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002066 goto utf8Error;
2067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002069 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002070 break;
2071
2072 case 4:
2073 if ((s[1] & 0xc0) != 0x80 ||
2074 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002075 (s[3] & 0xc0) != 0x80) {
2076 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002077 startinpos = s-starts;
2078 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002079 goto utf8Error;
2080 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002081 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2082 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2083 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002084 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002085 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002086 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002087 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002088 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002089 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002090 startinpos = s-starts;
2091 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002092 goto utf8Error;
2093 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002094#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002095 *p++ = (Py_UNICODE)ch;
2096#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002097 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002098
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002099 /* translate from 10000..10FFFF to 0..FFFF */
2100 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002101
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002102 /* high surrogate = top 10 bits added to D800 */
2103 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002104
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002105 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002106 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002107#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 break;
2109
2110 default:
2111 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002112 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 startinpos = s-starts;
2114 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002115 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 }
2117 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002118 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002119
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002120 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 outpos = p-PyUnicode_AS_UNICODE(unicode);
2122 if (unicode_decode_call_errorhandler(
2123 errors, &errorHandler,
2124 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002125 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002126 (PyObject **)&unicode, &outpos, &p))
2127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 }
Walter Dörwald69652032004-09-07 20:24:22 +00002129 if (consumed)
2130 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131
2132 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002133 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 goto onError;
2135
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002136 Py_XDECREF(errorHandler);
2137 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 return (PyObject *)unicode;
2139
2140onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002141 Py_XDECREF(errorHandler);
2142 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143 Py_DECREF(unicode);
2144 return NULL;
2145}
2146
Tim Peters602f7402002-04-27 18:03:26 +00002147/* Allocation strategy: if the string is short, convert into a stack buffer
2148 and allocate exactly as much space needed at the end. Else allocate the
2149 maximum possible needed (4 result bytes per Unicode character), and return
2150 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002151*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002152PyObject *
2153PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002154 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002155 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156{
Tim Peters602f7402002-04-27 18:03:26 +00002157#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002158
Guido van Rossum98297ee2007-11-06 21:34:58 +00002159 Py_ssize_t i; /* index into s of next input byte */
2160 PyObject *result; /* result string object */
2161 char *p; /* next free byte in output buffer */
2162 Py_ssize_t nallocated; /* number of result bytes allocated */
2163 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002164 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002165
Tim Peters602f7402002-04-27 18:03:26 +00002166 assert(s != NULL);
2167 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168
Tim Peters602f7402002-04-27 18:03:26 +00002169 if (size <= MAX_SHORT_UNICHARS) {
2170 /* Write into the stack buffer; nallocated can't overflow.
2171 * At the end, we'll allocate exactly as much heap space as it
2172 * turns out we need.
2173 */
2174 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002175 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002176 p = stackbuf;
2177 }
2178 else {
2179 /* Overallocate on the heap, and give the excess back at the end. */
2180 nallocated = size * 4;
2181 if (nallocated / 4 != size) /* overflow! */
2182 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002183 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002184 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002185 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002186 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002187 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002188
Tim Peters602f7402002-04-27 18:03:26 +00002189 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002190 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002191
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002192 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002193 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002195
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002197 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002198 *p++ = (char)(0xc0 | (ch >> 6));
2199 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002200 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002201 else {
Tim Peters602f7402002-04-27 18:03:26 +00002202 /* Encode UCS2 Unicode ordinals */
2203 if (ch < 0x10000) {
2204 /* Special case: check for high surrogate */
2205 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2206 Py_UCS4 ch2 = s[i];
2207 /* Check for low surrogate and combine the two to
2208 form a UCS4 value */
2209 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002210 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002211 i++;
2212 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002213 }
Tim Peters602f7402002-04-27 18:03:26 +00002214 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002215 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002216 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002217 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2218 *p++ = (char)(0x80 | (ch & 0x3f));
2219 continue;
2220 }
2221encodeUCS4:
2222 /* Encode UCS4 Unicode ordinals */
2223 *p++ = (char)(0xf0 | (ch >> 18));
2224 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2225 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2226 *p++ = (char)(0x80 | (ch & 0x3f));
2227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002229
Guido van Rossum98297ee2007-11-06 21:34:58 +00002230 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002231 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002232 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002233 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002234 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002235 }
2236 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002237 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002238 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002239 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002240 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002241 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002242 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002243
Tim Peters602f7402002-04-27 18:03:26 +00002244#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245}
2246
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2248{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249 if (!PyUnicode_Check(unicode)) {
2250 PyErr_BadArgument();
2251 return NULL;
2252 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002253 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2254 PyUnicode_GET_SIZE(unicode),
2255 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256}
2257
Walter Dörwald41980ca2007-08-16 21:55:45 +00002258/* --- UTF-32 Codec ------------------------------------------------------- */
2259
2260PyObject *
2261PyUnicode_DecodeUTF32(const char *s,
2262 Py_ssize_t size,
2263 const char *errors,
2264 int *byteorder)
2265{
2266 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2267}
2268
2269PyObject *
2270PyUnicode_DecodeUTF32Stateful(const char *s,
2271 Py_ssize_t size,
2272 const char *errors,
2273 int *byteorder,
2274 Py_ssize_t *consumed)
2275{
2276 const char *starts = s;
2277 Py_ssize_t startinpos;
2278 Py_ssize_t endinpos;
2279 Py_ssize_t outpos;
2280 PyUnicodeObject *unicode;
2281 Py_UNICODE *p;
2282#ifndef Py_UNICODE_WIDE
2283 int i, pairs;
2284#else
2285 const int pairs = 0;
2286#endif
2287 const unsigned char *q, *e;
2288 int bo = 0; /* assume native ordering by default */
2289 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002290 /* Offsets from q for retrieving bytes in the right order. */
2291#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2292 int iorder[] = {0, 1, 2, 3};
2293#else
2294 int iorder[] = {3, 2, 1, 0};
2295#endif
2296 PyObject *errorHandler = NULL;
2297 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002298 /* On narrow builds we split characters outside the BMP into two
2299 codepoints => count how much extra space we need. */
2300#ifndef Py_UNICODE_WIDE
2301 for (i = pairs = 0; i < size/4; i++)
2302 if (((Py_UCS4 *)s)[i] >= 0x10000)
2303 pairs++;
2304#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002305
2306 /* This might be one to much, because of a BOM */
2307 unicode = _PyUnicode_New((size+3)/4+pairs);
2308 if (!unicode)
2309 return NULL;
2310 if (size == 0)
2311 return (PyObject *)unicode;
2312
2313 /* Unpack UTF-32 encoded data */
2314 p = unicode->str;
2315 q = (unsigned char *)s;
2316 e = q + size;
2317
2318 if (byteorder)
2319 bo = *byteorder;
2320
2321 /* Check for BOM marks (U+FEFF) in the input and adjust current
2322 byte order setting accordingly. In native mode, the leading BOM
2323 mark is skipped, in all other modes, it is copied to the output
2324 stream as-is (giving a ZWNBSP character). */
2325 if (bo == 0) {
2326 if (size >= 4) {
2327 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2328 (q[iorder[1]] << 8) | q[iorder[0]];
2329#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2330 if (bom == 0x0000FEFF) {
2331 q += 4;
2332 bo = -1;
2333 }
2334 else if (bom == 0xFFFE0000) {
2335 q += 4;
2336 bo = 1;
2337 }
2338#else
2339 if (bom == 0x0000FEFF) {
2340 q += 4;
2341 bo = 1;
2342 }
2343 else if (bom == 0xFFFE0000) {
2344 q += 4;
2345 bo = -1;
2346 }
2347#endif
2348 }
2349 }
2350
2351 if (bo == -1) {
2352 /* force LE */
2353 iorder[0] = 0;
2354 iorder[1] = 1;
2355 iorder[2] = 2;
2356 iorder[3] = 3;
2357 }
2358 else if (bo == 1) {
2359 /* force BE */
2360 iorder[0] = 3;
2361 iorder[1] = 2;
2362 iorder[2] = 1;
2363 iorder[3] = 0;
2364 }
2365
2366 while (q < e) {
2367 Py_UCS4 ch;
2368 /* remaining bytes at the end? (size should be divisible by 4) */
2369 if (e-q<4) {
2370 if (consumed)
2371 break;
2372 errmsg = "truncated data";
2373 startinpos = ((const char *)q)-starts;
2374 endinpos = ((const char *)e)-starts;
2375 goto utf32Error;
2376 /* The remaining input chars are ignored if the callback
2377 chooses to skip the input */
2378 }
2379 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2380 (q[iorder[1]] << 8) | q[iorder[0]];
2381
2382 if (ch >= 0x110000)
2383 {
2384 errmsg = "codepoint not in range(0x110000)";
2385 startinpos = ((const char *)q)-starts;
2386 endinpos = startinpos+4;
2387 goto utf32Error;
2388 }
2389#ifndef Py_UNICODE_WIDE
2390 if (ch >= 0x10000)
2391 {
2392 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2393 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2394 }
2395 else
2396#endif
2397 *p++ = ch;
2398 q += 4;
2399 continue;
2400 utf32Error:
2401 outpos = p-PyUnicode_AS_UNICODE(unicode);
2402 if (unicode_decode_call_errorhandler(
2403 errors, &errorHandler,
2404 "utf32", errmsg,
2405 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2406 (PyObject **)&unicode, &outpos, &p))
2407 goto onError;
2408 }
2409
2410 if (byteorder)
2411 *byteorder = bo;
2412
2413 if (consumed)
2414 *consumed = (const char *)q-starts;
2415
2416 /* Adjust length */
2417 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2418 goto onError;
2419
2420 Py_XDECREF(errorHandler);
2421 Py_XDECREF(exc);
2422 return (PyObject *)unicode;
2423
2424onError:
2425 Py_DECREF(unicode);
2426 Py_XDECREF(errorHandler);
2427 Py_XDECREF(exc);
2428 return NULL;
2429}
2430
2431PyObject *
2432PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2433 Py_ssize_t size,
2434 const char *errors,
2435 int byteorder)
2436{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002437 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002438 unsigned char *p;
2439#ifndef Py_UNICODE_WIDE
2440 int i, pairs;
2441#else
2442 const int pairs = 0;
2443#endif
2444 /* Offsets from p for storing byte pairs in the right order. */
2445#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2446 int iorder[] = {0, 1, 2, 3};
2447#else
2448 int iorder[] = {3, 2, 1, 0};
2449#endif
2450
2451#define STORECHAR(CH) \
2452 do { \
2453 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2454 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2455 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2456 p[iorder[0]] = (CH) & 0xff; \
2457 p += 4; \
2458 } while(0)
2459
2460 /* In narrow builds we can output surrogate pairs as one codepoint,
2461 so we need less space. */
2462#ifndef Py_UNICODE_WIDE
2463 for (i = pairs = 0; i < size-1; i++)
2464 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2465 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2466 pairs++;
2467#endif
Christian Heimes9c4756e2008-05-26 13:22:05 +00002468 v = PyByteArray_FromStringAndSize(NULL,
Walter Dörwald41980ca2007-08-16 21:55:45 +00002469 4 * (size - pairs + (byteorder == 0)));
2470 if (v == NULL)
2471 return NULL;
2472
Christian Heimes9c4756e2008-05-26 13:22:05 +00002473 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002474 if (byteorder == 0)
2475 STORECHAR(0xFEFF);
2476 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002477 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002478
2479 if (byteorder == -1) {
2480 /* force LE */
2481 iorder[0] = 0;
2482 iorder[1] = 1;
2483 iorder[2] = 2;
2484 iorder[3] = 3;
2485 }
2486 else if (byteorder == 1) {
2487 /* force BE */
2488 iorder[0] = 3;
2489 iorder[1] = 2;
2490 iorder[2] = 1;
2491 iorder[3] = 0;
2492 }
2493
2494 while (size-- > 0) {
2495 Py_UCS4 ch = *s++;
2496#ifndef Py_UNICODE_WIDE
2497 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2498 Py_UCS4 ch2 = *s;
2499 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2500 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2501 s++;
2502 size--;
2503 }
2504 }
2505#endif
2506 STORECHAR(ch);
2507 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002508
2509 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002510 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002511 Py_DECREF(v);
2512 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002513#undef STORECHAR
2514}
2515
2516PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2517{
2518 if (!PyUnicode_Check(unicode)) {
2519 PyErr_BadArgument();
2520 return NULL;
2521 }
2522 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2523 PyUnicode_GET_SIZE(unicode),
2524 NULL,
2525 0);
2526}
2527
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528/* --- UTF-16 Codec ------------------------------------------------------- */
2529
Tim Peters772747b2001-08-09 22:21:55 +00002530PyObject *
2531PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002532 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002533 const char *errors,
2534 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535{
Walter Dörwald69652032004-09-07 20:24:22 +00002536 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2537}
2538
2539PyObject *
2540PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002541 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002542 const char *errors,
2543 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002544 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002546 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002547 Py_ssize_t startinpos;
2548 Py_ssize_t endinpos;
2549 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 PyUnicodeObject *unicode;
2551 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002552 const unsigned char *q, *e;
2553 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002554 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002555 /* Offsets from q for retrieving byte pairs in the right order. */
2556#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2557 int ihi = 1, ilo = 0;
2558#else
2559 int ihi = 0, ilo = 1;
2560#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 PyObject *errorHandler = NULL;
2562 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563
2564 /* Note: size will always be longer than the resulting Unicode
2565 character count */
2566 unicode = _PyUnicode_New(size);
2567 if (!unicode)
2568 return NULL;
2569 if (size == 0)
2570 return (PyObject *)unicode;
2571
2572 /* Unpack UTF-16 encoded data */
2573 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002574 q = (unsigned char *)s;
2575 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576
2577 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002578 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002580 /* Check for BOM marks (U+FEFF) in the input and adjust current
2581 byte order setting accordingly. In native mode, the leading BOM
2582 mark is skipped, in all other modes, it is copied to the output
2583 stream as-is (giving a ZWNBSP character). */
2584 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002585 if (size >= 2) {
2586 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002587#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002588 if (bom == 0xFEFF) {
2589 q += 2;
2590 bo = -1;
2591 }
2592 else if (bom == 0xFFFE) {
2593 q += 2;
2594 bo = 1;
2595 }
Tim Petersced69f82003-09-16 20:30:58 +00002596#else
Walter Dörwald69652032004-09-07 20:24:22 +00002597 if (bom == 0xFEFF) {
2598 q += 2;
2599 bo = 1;
2600 }
2601 else if (bom == 0xFFFE) {
2602 q += 2;
2603 bo = -1;
2604 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002605#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002606 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608
Tim Peters772747b2001-08-09 22:21:55 +00002609 if (bo == -1) {
2610 /* force LE */
2611 ihi = 1;
2612 ilo = 0;
2613 }
2614 else if (bo == 1) {
2615 /* force BE */
2616 ihi = 0;
2617 ilo = 1;
2618 }
2619
2620 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002622 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002624 if (consumed)
2625 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626 errmsg = "truncated data";
2627 startinpos = ((const char *)q)-starts;
2628 endinpos = ((const char *)e)-starts;
2629 goto utf16Error;
2630 /* The remaining input chars are ignored if the callback
2631 chooses to skip the input */
2632 }
2633 ch = (q[ihi] << 8) | q[ilo];
2634
Tim Peters772747b2001-08-09 22:21:55 +00002635 q += 2;
2636
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637 if (ch < 0xD800 || ch > 0xDFFF) {
2638 *p++ = ch;
2639 continue;
2640 }
2641
2642 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002643 if (q >= e) {
2644 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002645 startinpos = (((const char *)q)-2)-starts;
2646 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002647 goto utf16Error;
2648 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002649 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002650 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2651 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002652 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002653#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002654 *p++ = ch;
2655 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002656#else
2657 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002658#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002659 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002660 }
2661 else {
2662 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 startinpos = (((const char *)q)-4)-starts;
2664 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002665 goto utf16Error;
2666 }
2667
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002669 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002670 startinpos = (((const char *)q)-2)-starts;
2671 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002672 /* Fall through to report the error */
2673
2674 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 outpos = p-PyUnicode_AS_UNICODE(unicode);
2676 if (unicode_decode_call_errorhandler(
2677 errors, &errorHandler,
2678 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002679 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002680 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002681 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 }
2683
2684 if (byteorder)
2685 *byteorder = bo;
2686
Walter Dörwald69652032004-09-07 20:24:22 +00002687 if (consumed)
2688 *consumed = (const char *)q-starts;
2689
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002691 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 goto onError;
2693
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 Py_XDECREF(errorHandler);
2695 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 return (PyObject *)unicode;
2697
2698onError:
2699 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002700 Py_XDECREF(errorHandler);
2701 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 return NULL;
2703}
2704
Tim Peters772747b2001-08-09 22:21:55 +00002705PyObject *
2706PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002707 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002708 const char *errors,
2709 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002711 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002712 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002713#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002714 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002715#else
2716 const int pairs = 0;
2717#endif
Tim Peters772747b2001-08-09 22:21:55 +00002718 /* Offsets from p for storing byte pairs in the right order. */
2719#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2720 int ihi = 1, ilo = 0;
2721#else
2722 int ihi = 0, ilo = 1;
2723#endif
2724
2725#define STORECHAR(CH) \
2726 do { \
2727 p[ihi] = ((CH) >> 8) & 0xff; \
2728 p[ilo] = (CH) & 0xff; \
2729 p += 2; \
2730 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002732#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002733 for (i = pairs = 0; i < size; i++)
2734 if (s[i] >= 0x10000)
2735 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002736#endif
Christian Heimes9c4756e2008-05-26 13:22:05 +00002737 v = PyByteArray_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002738 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 if (v == NULL)
2740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741
Christian Heimes9c4756e2008-05-26 13:22:05 +00002742 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002744 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002745 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002746 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002747
2748 if (byteorder == -1) {
2749 /* force LE */
2750 ihi = 1;
2751 ilo = 0;
2752 }
2753 else if (byteorder == 1) {
2754 /* force BE */
2755 ihi = 0;
2756 ilo = 1;
2757 }
2758
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002759 while (size-- > 0) {
2760 Py_UNICODE ch = *s++;
2761 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002762#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002763 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002764 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2765 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002767#endif
Tim Peters772747b2001-08-09 22:21:55 +00002768 STORECHAR(ch);
2769 if (ch2)
2770 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002771 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002772
2773 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002774 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002775 Py_DECREF(v);
2776 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002777#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778}
2779
2780PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2781{
2782 if (!PyUnicode_Check(unicode)) {
2783 PyErr_BadArgument();
2784 return NULL;
2785 }
2786 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2787 PyUnicode_GET_SIZE(unicode),
2788 NULL,
2789 0);
2790}
2791
2792/* --- Unicode Escape Codec ----------------------------------------------- */
2793
Fredrik Lundh06d12682001-01-24 07:59:11 +00002794static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002795
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002797 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 const char *errors)
2799{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002801 Py_ssize_t startinpos;
2802 Py_ssize_t endinpos;
2803 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002804 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002806 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002808 char* message;
2809 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 PyObject *errorHandler = NULL;
2811 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002812
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 /* Escaped strings will always be longer than the resulting
2814 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002815 length after conversion to the true value.
2816 (but if the error callback returns a long replacement string
2817 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 v = _PyUnicode_New(size);
2819 if (v == NULL)
2820 goto onError;
2821 if (size == 0)
2822 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002826
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 while (s < end) {
2828 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002829 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002830 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831
2832 /* Non-escape characters are interpreted as Unicode ordinals */
2833 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002834 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 continue;
2836 }
2837
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 /* \ - Escapes */
2840 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002841 c = *s++;
2842 if (s > end)
2843 c = '\0'; /* Invalid after \ */
2844 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845
2846 /* \x escapes */
2847 case '\n': break;
2848 case '\\': *p++ = '\\'; break;
2849 case '\'': *p++ = '\''; break;
2850 case '\"': *p++ = '\"'; break;
2851 case 'b': *p++ = '\b'; break;
2852 case 'f': *p++ = '\014'; break; /* FF */
2853 case 't': *p++ = '\t'; break;
2854 case 'n': *p++ = '\n'; break;
2855 case 'r': *p++ = '\r'; break;
2856 case 'v': *p++ = '\013'; break; /* VT */
2857 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2858
2859 /* \OOO (octal) escapes */
2860 case '0': case '1': case '2': case '3':
2861 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002862 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002863 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002864 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002865 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002866 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002868 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 break;
2870
Fredrik Lundhccc74732001-02-18 22:13:49 +00002871 /* hex escapes */
2872 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002874 digits = 2;
2875 message = "truncated \\xXX escape";
2876 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002880 digits = 4;
2881 message = "truncated \\uXXXX escape";
2882 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883
Fredrik Lundhccc74732001-02-18 22:13:49 +00002884 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002885 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002886 digits = 8;
2887 message = "truncated \\UXXXXXXXX escape";
2888 hexescape:
2889 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002890 outpos = p-PyUnicode_AS_UNICODE(v);
2891 if (s+digits>end) {
2892 endinpos = size;
2893 if (unicode_decode_call_errorhandler(
2894 errors, &errorHandler,
2895 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002896 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002897 (PyObject **)&v, &outpos, &p))
2898 goto onError;
2899 goto nextByte;
2900 }
2901 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002902 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002903 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002904 endinpos = (s+i+1)-starts;
2905 if (unicode_decode_call_errorhandler(
2906 errors, &errorHandler,
2907 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002908 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002910 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002911 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002912 }
2913 chr = (chr<<4) & ~0xF;
2914 if (c >= '0' && c <= '9')
2915 chr += c - '0';
2916 else if (c >= 'a' && c <= 'f')
2917 chr += 10 + c - 'a';
2918 else
2919 chr += 10 + c - 'A';
2920 }
2921 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002922 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002923 /* _decoding_error will have already written into the
2924 target buffer. */
2925 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002926 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002927 /* when we get here, chr is a 32-bit unicode character */
2928 if (chr <= 0xffff)
2929 /* UCS-2 character */
2930 *p++ = (Py_UNICODE) chr;
2931 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002932 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002933 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002934#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002935 *p++ = chr;
2936#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002937 chr -= 0x10000L;
2938 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002939 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002940#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002941 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002942 endinpos = s-starts;
2943 outpos = p-PyUnicode_AS_UNICODE(v);
2944 if (unicode_decode_call_errorhandler(
2945 errors, &errorHandler,
2946 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002947 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002948 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002949 goto onError;
2950 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002951 break;
2952
2953 /* \N{name} */
2954 case 'N':
2955 message = "malformed \\N character escape";
2956 if (ucnhash_CAPI == NULL) {
2957 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002958 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002959 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002960 if (m == NULL)
2961 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002962 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002963 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002964 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002965 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002966 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002967 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002968 if (ucnhash_CAPI == NULL)
2969 goto ucnhashError;
2970 }
2971 if (*s == '{') {
2972 const char *start = s+1;
2973 /* look for the closing brace */
2974 while (*s != '}' && s < end)
2975 s++;
2976 if (s > start && s < end && *s == '}') {
2977 /* found a name. look it up in the unicode database */
2978 message = "unknown Unicode character name";
2979 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002980 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002981 goto store;
2982 }
2983 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 endinpos = s-starts;
2985 outpos = p-PyUnicode_AS_UNICODE(v);
2986 if (unicode_decode_call_errorhandler(
2987 errors, &errorHandler,
2988 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002989 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002990 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002991 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002992 break;
2993
2994 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002995 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 message = "\\ at end of string";
2997 s--;
2998 endinpos = s-starts;
2999 outpos = p-PyUnicode_AS_UNICODE(v);
3000 if (unicode_decode_call_errorhandler(
3001 errors, &errorHandler,
3002 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003003 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003005 goto onError;
3006 }
3007 else {
3008 *p++ = '\\';
3009 *p++ = (unsigned char)s[-1];
3010 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003011 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003013 nextByte:
3014 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003016 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003017 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003018 Py_XDECREF(errorHandler);
3019 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003021
Fredrik Lundhccc74732001-02-18 22:13:49 +00003022ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003023 PyErr_SetString(
3024 PyExc_UnicodeError,
3025 "\\N escapes not supported (can't load unicodedata module)"
3026 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003027 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003028 Py_XDECREF(errorHandler);
3029 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003030 return NULL;
3031
Fredrik Lundhccc74732001-02-18 22:13:49 +00003032onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003034 Py_XDECREF(errorHandler);
3035 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 return NULL;
3037}
3038
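/* Return a Unicode-Escape string version of the Unicode object.

   Note: this comment has also said "If quotes is true, the string is
   enclosed in u"" or u'' quotes as appropriate."  The encoder defined
   below, PyUnicode_EncodeUnicodeEscape(), takes no quotes argument and
   adds no enclosing quotes.

*/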
3045
Thomas Wouters477c8d52006-05-27 19:21:47 +00003046Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3047 Py_ssize_t size,
3048 Py_UNICODE ch)
3049{
3050 /* like wcschr, but doesn't stop at NULL characters */
3051
3052 while (size-- > 0) {
3053 if (*s == ch)
3054 return s;
3055 s++;
3056 }
3057
3058 return NULL;
3059}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003060
Walter Dörwald79e913e2007-05-12 11:08:06 +00003061static const char *hexdigits = "0123456789abcdef";
3062
3063PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3064 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003066 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068
Thomas Wouters89f507f2006-12-13 04:49:30 +00003069 /* XXX(nnorwitz): rather than over-allocating, it would be
3070 better to choose a different scheme. Perhaps scan the
3071 first N-chars of the string and allocate based on that size.
3072 */
3073 /* Initial allocation is based on the longest-possible unichr
3074 escape.
3075
3076 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3077 unichr, so in this case it's the longest unichr escape. In
3078 narrow (UTF-16) builds this is five chars per source unichr
3079 since there are two unichrs in the surrogate pair, so in narrow
3080 (UTF-16) builds it's not the longest unichr escape.
3081
3082 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3083 so in the narrow (UTF-16) build case it's the longest unichr
3084 escape.
3085 */
3086
Christian Heimes9c4756e2008-05-26 13:22:05 +00003087 repr = PyByteArray_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00003088#ifdef Py_UNICODE_WIDE
3089 + 10*size
3090#else
3091 + 6*size
3092#endif
3093 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094 if (repr == NULL)
3095 return NULL;
3096
Christian Heimes9c4756e2008-05-26 13:22:05 +00003097 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 while (size-- > 0) {
3100 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003101
Walter Dörwald79e913e2007-05-12 11:08:06 +00003102 /* Escape backslashes */
3103 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 *p++ = '\\';
3105 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003106 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003107 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003108
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003109#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003110 /* Map 21-bit characters to '\U00xxxxxx' */
3111 else if (ch >= 0x10000) {
3112 *p++ = '\\';
3113 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003114 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3115 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3116 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3117 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3118 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3119 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3120 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3121 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003122 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003123 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003124#else
3125 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003126 else if (ch >= 0xD800 && ch < 0xDC00) {
3127 Py_UNICODE ch2;
3128 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003129
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003130 ch2 = *s++;
3131 size--;
3132 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3133 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3134 *p++ = '\\';
3135 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003136 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3137 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3138 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3139 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3140 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3141 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3142 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3143 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003144 continue;
3145 }
3146 /* Fall through: isolated surrogates are copied as-is */
3147 s--;
3148 size++;
3149 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003150#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003151
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003153 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154 *p++ = '\\';
3155 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003156 *p++ = hexdigits[(ch >> 12) & 0x000F];
3157 *p++ = hexdigits[(ch >> 8) & 0x000F];
3158 *p++ = hexdigits[(ch >> 4) & 0x000F];
3159 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003161
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003162 /* Map special whitespace to '\t', \n', '\r' */
3163 else if (ch == '\t') {
3164 *p++ = '\\';
3165 *p++ = 't';
3166 }
3167 else if (ch == '\n') {
3168 *p++ = '\\';
3169 *p++ = 'n';
3170 }
3171 else if (ch == '\r') {
3172 *p++ = '\\';
3173 *p++ = 'r';
3174 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003175
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003176 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003177 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003179 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003180 *p++ = hexdigits[(ch >> 4) & 0x000F];
3181 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003182 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003183
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184 /* Copy everything else as-is */
3185 else
3186 *p++ = (char) ch;
3187 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188
Christian Heimes72b710a2008-05-26 13:28:38 +00003189 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003190 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003191 Py_DECREF(repr);
3192 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193}
3194
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3196{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003197 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 if (!PyUnicode_Check(unicode)) {
3199 PyErr_BadArgument();
3200 return NULL;
3201 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003202 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3203 PyUnicode_GET_SIZE(unicode));
3204
3205 if (!s)
3206 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003207 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003208 PyByteArray_GET_SIZE(s));
Walter Dörwald79e913e2007-05-12 11:08:06 +00003209 Py_DECREF(s);
3210 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211}
3212
3213/* --- Raw Unicode Escape Codec ------------------------------------------- */
3214
3215PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003216 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 const char *errors)
3218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003219 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003220 Py_ssize_t startinpos;
3221 Py_ssize_t endinpos;
3222 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 const char *end;
3226 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227 PyObject *errorHandler = NULL;
3228 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003229
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 /* Escaped strings will always be longer than the resulting
3231 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 length after conversion to the true value. (But decoding error
3233 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 v = _PyUnicode_New(size);
3235 if (v == NULL)
3236 goto onError;
3237 if (size == 0)
3238 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 end = s + size;
3241 while (s < end) {
3242 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003243 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003245 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246
3247 /* Non-escape characters are interpreted as Unicode ordinals */
3248 if (*s != '\\') {
3249 *p++ = (unsigned char)*s++;
3250 continue;
3251 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003252 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253
3254 /* \u-escapes are only interpreted iff the number of leading
3255 backslashes if odd */
3256 bs = s;
3257 for (;s < end;) {
3258 if (*s != '\\')
3259 break;
3260 *p++ = (unsigned char)*s++;
3261 }
3262 if (((s - bs) & 1) == 0 ||
3263 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003264 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 continue;
3266 }
3267 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003268 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 s++;
3270
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003271 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003273 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003275 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 endinpos = s-starts;
3277 if (unicode_decode_call_errorhandler(
3278 errors, &errorHandler,
3279 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003280 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 }
3285 x = (x<<4) & ~0xF;
3286 if (c >= '0' && c <= '9')
3287 x += c - '0';
3288 else if (c >= 'a' && c <= 'f')
3289 x += 10 + c - 'a';
3290 else
3291 x += 10 + c - 'A';
3292 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003293 if (x <= 0xffff)
3294 /* UCS-2 character */
3295 *p++ = (Py_UNICODE) x;
3296 else if (x <= 0x10ffff) {
3297 /* UCS-4 character. Either store directly, or as
3298 surrogate pair. */
3299#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003300 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003301#else
3302 x -= 0x10000L;
3303 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3304 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3305#endif
3306 } else {
3307 endinpos = s-starts;
3308 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003309 if (unicode_decode_call_errorhandler(
3310 errors, &errorHandler,
3311 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003312 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003313 (PyObject **)&v, &outpos, &p))
3314 goto onError;
3315 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316 nextByte:
3317 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003319 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003320 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 Py_XDECREF(errorHandler);
3322 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003324
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 onError:
3326 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003327 Py_XDECREF(errorHandler);
3328 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329 return NULL;
3330}
3331
3332PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003335 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 char *p;
3337 char *q;
3338
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003339#ifdef Py_UNICODE_WIDE
Christian Heimes9c4756e2008-05-26 13:22:05 +00003340 repr = PyByteArray_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003341#else
Christian Heimes9c4756e2008-05-26 13:22:05 +00003342 repr = PyByteArray_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003343#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 if (repr == NULL)
3345 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003346 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003347 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348
Christian Heimes9c4756e2008-05-26 13:22:05 +00003349 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 while (size-- > 0) {
3351 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003352#ifdef Py_UNICODE_WIDE
3353 /* Map 32-bit characters to '\Uxxxxxxxx' */
3354 if (ch >= 0x10000) {
3355 *p++ = '\\';
3356 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003357 *p++ = hexdigits[(ch >> 28) & 0xf];
3358 *p++ = hexdigits[(ch >> 24) & 0xf];
3359 *p++ = hexdigits[(ch >> 20) & 0xf];
3360 *p++ = hexdigits[(ch >> 16) & 0xf];
3361 *p++ = hexdigits[(ch >> 12) & 0xf];
3362 *p++ = hexdigits[(ch >> 8) & 0xf];
3363 *p++ = hexdigits[(ch >> 4) & 0xf];
3364 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003365 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003366 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003367#else
3368 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3369 if (ch >= 0xD800 && ch < 0xDC00) {
3370 Py_UNICODE ch2;
3371 Py_UCS4 ucs;
3372
3373 ch2 = *s++;
3374 size--;
3375 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3376 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3377 *p++ = '\\';
3378 *p++ = 'U';
3379 *p++ = hexdigits[(ucs >> 28) & 0xf];
3380 *p++ = hexdigits[(ucs >> 24) & 0xf];
3381 *p++ = hexdigits[(ucs >> 20) & 0xf];
3382 *p++ = hexdigits[(ucs >> 16) & 0xf];
3383 *p++ = hexdigits[(ucs >> 12) & 0xf];
3384 *p++ = hexdigits[(ucs >> 8) & 0xf];
3385 *p++ = hexdigits[(ucs >> 4) & 0xf];
3386 *p++ = hexdigits[ucs & 0xf];
3387 continue;
3388 }
3389 /* Fall through: isolated surrogates are copied as-is */
3390 s--;
3391 size++;
3392 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003393#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 /* Map 16-bit characters to '\uxxxx' */
3395 if (ch >= 256) {
3396 *p++ = '\\';
3397 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003398 *p++ = hexdigits[(ch >> 12) & 0xf];
3399 *p++ = hexdigits[(ch >> 8) & 0xf];
3400 *p++ = hexdigits[(ch >> 4) & 0xf];
3401 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402 }
3403 /* Copy everything else as-is */
3404 else
3405 *p++ = (char) ch;
3406 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003407 size = p - q;
3408
3409 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003410 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003411 Py_DECREF(repr);
3412 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003413}
3414
3415PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3416{
Walter Dörwald711005d2007-05-12 12:03:26 +00003417 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003419 PyErr_BadArgument();
3420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003422 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3423 PyUnicode_GET_SIZE(unicode));
3424
3425 if (!s)
3426 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003427 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003428 PyByteArray_GET_SIZE(s));
Walter Dörwald711005d2007-05-12 12:03:26 +00003429 Py_DECREF(s);
3430 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431}
3432
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003433/* --- Unicode Internal Codec ------------------------------------------- */
3434
3435PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003436 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003437 const char *errors)
3438{
3439 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003440 Py_ssize_t startinpos;
3441 Py_ssize_t endinpos;
3442 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003443 PyUnicodeObject *v;
3444 Py_UNICODE *p;
3445 const char *end;
3446 const char *reason;
3447 PyObject *errorHandler = NULL;
3448 PyObject *exc = NULL;
3449
Neal Norwitzd43069c2006-01-08 01:12:10 +00003450#ifdef Py_UNICODE_WIDE
3451 Py_UNICODE unimax = PyUnicode_GetMax();
3452#endif
3453
Thomas Wouters89f507f2006-12-13 04:49:30 +00003454 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003455 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3456 if (v == NULL)
3457 goto onError;
3458 if (PyUnicode_GetSize((PyObject *)v) == 0)
3459 return (PyObject *)v;
3460 p = PyUnicode_AS_UNICODE(v);
3461 end = s + size;
3462
3463 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003464 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003465 /* We have to sanity check the raw data, otherwise doom looms for
3466 some malformed UCS-4 data. */
3467 if (
3468 #ifdef Py_UNICODE_WIDE
3469 *p > unimax || *p < 0 ||
3470 #endif
3471 end-s < Py_UNICODE_SIZE
3472 )
3473 {
3474 startinpos = s - starts;
3475 if (end-s < Py_UNICODE_SIZE) {
3476 endinpos = end-starts;
3477 reason = "truncated input";
3478 }
3479 else {
3480 endinpos = s - starts + Py_UNICODE_SIZE;
3481 reason = "illegal code point (> 0x10FFFF)";
3482 }
3483 outpos = p - PyUnicode_AS_UNICODE(v);
3484 if (unicode_decode_call_errorhandler(
3485 errors, &errorHandler,
3486 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003487 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003488 (PyObject **)&v, &outpos, &p)) {
3489 goto onError;
3490 }
3491 }
3492 else {
3493 p++;
3494 s += Py_UNICODE_SIZE;
3495 }
3496 }
3497
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003498 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003499 goto onError;
3500 Py_XDECREF(errorHandler);
3501 Py_XDECREF(exc);
3502 return (PyObject *)v;
3503
3504 onError:
3505 Py_XDECREF(v);
3506 Py_XDECREF(errorHandler);
3507 Py_XDECREF(exc);
3508 return NULL;
3509}
3510
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511/* --- Latin-1 Codec ------------------------------------------------------ */
3512
3513PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003514 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 const char *errors)
3516{
3517 PyUnicodeObject *v;
3518 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003519
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003521 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003522 Py_UNICODE r = *(unsigned char*)s;
3523 return PyUnicode_FromUnicode(&r, 1);
3524 }
3525
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 v = _PyUnicode_New(size);
3527 if (v == NULL)
3528 goto onError;
3529 if (size == 0)
3530 return (PyObject *)v;
3531 p = PyUnicode_AS_UNICODE(v);
3532 while (size-- > 0)
3533 *p++ = (unsigned char)*s++;
3534 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003535
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 onError:
3537 Py_XDECREF(v);
3538 return NULL;
3539}
3540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541/* create or adjust a UnicodeEncodeError */
3542static void make_encode_exception(PyObject **exceptionObject,
3543 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003544 const Py_UNICODE *unicode, Py_ssize_t size,
3545 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 if (*exceptionObject == NULL) {
3549 *exceptionObject = PyUnicodeEncodeError_Create(
3550 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 }
3552 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3554 goto onError;
3555 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3556 goto onError;
3557 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3558 goto onError;
3559 return;
3560 onError:
3561 Py_DECREF(*exceptionObject);
3562 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 }
3564}
3565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566/* raises a UnicodeEncodeError */
3567static void raise_encode_exception(PyObject **exceptionObject,
3568 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003569 const Py_UNICODE *unicode, Py_ssize_t size,
3570 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 const char *reason)
3572{
3573 make_encode_exception(exceptionObject,
3574 encoding, unicode, size, startpos, endpos, reason);
3575 if (*exceptionObject != NULL)
3576 PyCodec_StrictErrors(*exceptionObject);
3577}
3578
3579/* error handling callback helper:
3580 build arguments, call the callback and check the arguments,
3581 put the result into newpos and return the replacement string, which
3582 has to be freed by the caller */
3583static PyObject *unicode_encode_call_errorhandler(const char *errors,
3584 PyObject **errorHandler,
3585 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003586 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3587 Py_ssize_t startpos, Py_ssize_t endpos,
3588 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003590 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591
3592 PyObject *restuple;
3593 PyObject *resunicode;
3594
3595 if (*errorHandler == NULL) {
3596 *errorHandler = PyCodec_LookupError(errors);
3597 if (*errorHandler == NULL)
3598 return NULL;
3599 }
3600
3601 make_encode_exception(exceptionObject,
3602 encoding, unicode, size, startpos, endpos, reason);
3603 if (*exceptionObject == NULL)
3604 return NULL;
3605
3606 restuple = PyObject_CallFunctionObjArgs(
3607 *errorHandler, *exceptionObject, NULL);
3608 if (restuple == NULL)
3609 return NULL;
3610 if (!PyTuple_Check(restuple)) {
3611 PyErr_Format(PyExc_TypeError, &argparse[4]);
3612 Py_DECREF(restuple);
3613 return NULL;
3614 }
3615 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3616 &resunicode, newpos)) {
3617 Py_DECREF(restuple);
3618 return NULL;
3619 }
3620 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003621 *newpos = size+*newpos;
3622 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003623 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003624 Py_DECREF(restuple);
3625 return NULL;
3626 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 Py_INCREF(resunicode);
3628 Py_DECREF(restuple);
3629 return resunicode;
3630}
3631
3632static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003633 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 const char *errors,
3635 int limit)
3636{
3637 /* output object */
3638 PyObject *res;
3639 /* pointers to the beginning and end+1 of input */
3640 const Py_UNICODE *startp = p;
3641 const Py_UNICODE *endp = p + size;
3642 /* pointer to the beginning of the unencodable characters */
3643 /* const Py_UNICODE *badp = NULL; */
3644 /* pointer into the output */
3645 char *str;
3646 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003647 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003648 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3649 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 PyObject *errorHandler = NULL;
3651 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003652 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 /* the following variable is used for caching string comparisons
3654 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3655 int known_errorHandler = -1;
3656
3657 /* allocate enough for a simple encoding without
3658 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003659 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003660 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003661 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003663 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003664 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 ressize = size;
3666
3667 while (p<endp) {
3668 Py_UNICODE c = *p;
3669
3670 /* can we encode this? */
3671 if (c<limit) {
3672 /* no overflow check, because we know that the space is enough */
3673 *str++ = (char)c;
3674 ++p;
3675 }
3676 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003677 Py_ssize_t unicodepos = p-startp;
3678 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003680 Py_ssize_t repsize;
3681 Py_ssize_t newpos;
3682 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 Py_UNICODE *uni2;
3684 /* startpos for collecting unencodable chars */
3685 const Py_UNICODE *collstart = p;
3686 const Py_UNICODE *collend = p;
3687 /* find all unecodable characters */
3688 while ((collend < endp) && ((*collend)>=limit))
3689 ++collend;
3690 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3691 if (known_errorHandler==-1) {
3692 if ((errors==NULL) || (!strcmp(errors, "strict")))
3693 known_errorHandler = 1;
3694 else if (!strcmp(errors, "replace"))
3695 known_errorHandler = 2;
3696 else if (!strcmp(errors, "ignore"))
3697 known_errorHandler = 3;
3698 else if (!strcmp(errors, "xmlcharrefreplace"))
3699 known_errorHandler = 4;
3700 else
3701 known_errorHandler = 0;
3702 }
3703 switch (known_errorHandler) {
3704 case 1: /* strict */
3705 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3706 goto onError;
3707 case 2: /* replace */
3708 while (collstart++<collend)
3709 *str++ = '?'; /* fall through */
3710 case 3: /* ignore */
3711 p = collend;
3712 break;
3713 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003714 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003715 /* determine replacement size (temporarily (mis)uses p) */
3716 for (p = collstart, repsize = 0; p < collend; ++p) {
3717 if (*p<10)
3718 repsize += 2+1+1;
3719 else if (*p<100)
3720 repsize += 2+2+1;
3721 else if (*p<1000)
3722 repsize += 2+3+1;
3723 else if (*p<10000)
3724 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003725#ifndef Py_UNICODE_WIDE
3726 else
3727 repsize += 2+5+1;
3728#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 else if (*p<100000)
3730 repsize += 2+5+1;
3731 else if (*p<1000000)
3732 repsize += 2+6+1;
3733 else
3734 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003735#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003736 }
3737 requiredsize = respos+repsize+(endp-collend);
3738 if (requiredsize > ressize) {
3739 if (requiredsize<2*ressize)
3740 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003741 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003743 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 ressize = requiredsize;
3745 }
3746 /* generate replacement (temporarily (mis)uses p) */
3747 for (p = collstart; p < collend; ++p) {
3748 str += sprintf(str, "&#%d;", (int)*p);
3749 }
3750 p = collend;
3751 break;
3752 default:
3753 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3754 encoding, reason, startp, size, &exc,
3755 collstart-startp, collend-startp, &newpos);
3756 if (repunicode == NULL)
3757 goto onError;
3758 /* need more space? (at least enough for what we
3759 have+the replacement+the rest of the string, so
3760 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003761 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 repsize = PyUnicode_GET_SIZE(repunicode);
3763 requiredsize = respos+repsize+(endp-collend);
3764 if (requiredsize > ressize) {
3765 if (requiredsize<2*ressize)
3766 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003767 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 Py_DECREF(repunicode);
3769 goto onError;
3770 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003771 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 ressize = requiredsize;
3773 }
3774 /* check if there is anything unencodable in the replacement
3775 and copy it to the output */
3776 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3777 c = *uni2;
3778 if (c >= limit) {
3779 raise_encode_exception(&exc, encoding, startp, size,
3780 unicodepos, unicodepos+1, reason);
3781 Py_DECREF(repunicode);
3782 goto onError;
3783 }
3784 *str = (char)c;
3785 }
3786 p = startp + newpos;
3787 Py_DECREF(repunicode);
3788 }
3789 }
3790 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003791 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003792 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003793 onError:
3794 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 Py_XDECREF(errorHandler);
3796 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003797 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798}
3799
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003801 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 const char *errors)
3803{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805}
3806
3807PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3808{
3809 if (!PyUnicode_Check(unicode)) {
3810 PyErr_BadArgument();
3811 return NULL;
3812 }
3813 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3814 PyUnicode_GET_SIZE(unicode),
3815 NULL);
3816}
3817
3818/* --- 7-bit ASCII Codec -------------------------------------------------- */
3819
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003821 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 const char *errors)
3823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 PyUnicodeObject *v;
3826 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003827 Py_ssize_t startinpos;
3828 Py_ssize_t endinpos;
3829 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003830 const char *e;
3831 PyObject *errorHandler = NULL;
3832 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003833
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003835 if (size == 1 && *(unsigned char*)s < 128) {
3836 Py_UNICODE r = *(unsigned char*)s;
3837 return PyUnicode_FromUnicode(&r, 1);
3838 }
Tim Petersced69f82003-09-16 20:30:58 +00003839
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 v = _PyUnicode_New(size);
3841 if (v == NULL)
3842 goto onError;
3843 if (size == 0)
3844 return (PyObject *)v;
3845 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003846 e = s + size;
3847 while (s < e) {
3848 register unsigned char c = (unsigned char)*s;
3849 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003851 ++s;
3852 }
3853 else {
3854 startinpos = s-starts;
3855 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003856 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 if (unicode_decode_call_errorhandler(
3858 errors, &errorHandler,
3859 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003860 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003865 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003866 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003867 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003868 Py_XDECREF(errorHandler);
3869 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003871
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 onError:
3873 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003874 Py_XDECREF(errorHandler);
3875 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 return NULL;
3877}
3878
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003880 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 const char *errors)
3882{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884}
3885
3886PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3887{
3888 if (!PyUnicode_Check(unicode)) {
3889 PyErr_BadArgument();
3890 return NULL;
3891 }
3892 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3893 PyUnicode_GET_SIZE(unicode),
3894 NULL);
3895}
3896
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003897#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003898
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003899/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003900
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003901#if SIZEOF_INT < SIZEOF_SSIZE_T
3902#define NEED_RETRY
3903#endif
3904
3905/* XXX This code is limited to "true" double-byte encodings, as
3906 a) it assumes an incomplete character consists of a single byte, and
3907 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3908 encodings, see IsDBCSLeadByteEx documentation. */
3909
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte itself must satisfy IsDBCSLeadByte().  In addition, the
   character before it (located with CharPrev) must be one of:
     - absent, i.e. CharPrev returned curr itself (presumably because
       curr is at the start of s),
     - not starting with a lead byte, or
     - exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3920
3921/*
3922 * Decode MBCS string into unicode object. If 'final' is set, converts
3923 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3924 */
3925static int decode_mbcs(PyUnicodeObject **v,
3926 const char *s, /* MBCS string */
3927 int size, /* sizeof MBCS string */
3928 int final)
3929{
3930 Py_UNICODE *p;
3931 Py_ssize_t n = 0;
3932 int usize = 0;
3933
3934 assert(size >= 0);
3935
3936 /* Skip trailing lead-byte unless 'final' is set */
3937 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3938 --size;
3939
3940 /* First get the size of the result */
3941 if (size > 0) {
3942 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3943 if (usize == 0) {
3944 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3945 return -1;
3946 }
3947 }
3948
3949 if (*v == NULL) {
3950 /* Create unicode object */
3951 *v = _PyUnicode_New(usize);
3952 if (*v == NULL)
3953 return -1;
3954 }
3955 else {
3956 /* Extend unicode object */
3957 n = PyUnicode_GET_SIZE(*v);
3958 if (_PyUnicode_Resize(v, n + usize) < 0)
3959 return -1;
3960 }
3961
3962 /* Do the conversion */
3963 if (size > 0) {
3964 p = PyUnicode_AS_UNICODE(*v) + n;
3965 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3966 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3967 return -1;
3968 }
3969 }
3970
3971 return size;
3972}
3973
3974PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3975 Py_ssize_t size,
3976 const char *errors,
3977 Py_ssize_t *consumed)
3978{
3979 PyUnicodeObject *v = NULL;
3980 int done;
3981
3982 if (consumed)
3983 *consumed = 0;
3984
3985#ifdef NEED_RETRY
3986 retry:
3987 if (size > INT_MAX)
3988 done = decode_mbcs(&v, s, INT_MAX, 0);
3989 else
3990#endif
3991 done = decode_mbcs(&v, s, (int)size, !consumed);
3992
3993 if (done < 0) {
3994 Py_XDECREF(v);
3995 return NULL;
3996 }
3997
3998 if (consumed)
3999 *consumed += done;
4000
4001#ifdef NEED_RETRY
4002 if (size > INT_MAX) {
4003 s += done;
4004 size -= done;
4005 goto retry;
4006 }
4007#endif
4008
4009 return (PyObject *)v;
4010}
4011
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004012PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004013 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004014 const char *errors)
4015{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004016 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4017}
4018
4019/*
4020 * Convert unicode into string object (MBCS).
4021 * Returns 0 if succeed, -1 otherwise.
4022 */
4023static int encode_mbcs(PyObject **repr,
4024 const Py_UNICODE *p, /* unicode */
4025 int size) /* size of unicode */
4026{
4027 int mbcssize = 0;
4028 Py_ssize_t n = 0;
4029
4030 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004031
4032 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004033 if (size > 0) {
4034 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4035 if (mbcssize == 0) {
4036 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4037 return -1;
4038 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004039 }
4040
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004041 if (*repr == NULL) {
4042 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004043 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004044 if (*repr == NULL)
4045 return -1;
4046 }
4047 else {
4048 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004049 n = PyBytes_Size(*repr);
4050 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004051 return -1;
4052 }
4053
4054 /* Do the conversion */
4055 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004056 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004057 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4058 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4059 return -1;
4060 }
4061 }
4062
4063 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004064}
4065
4066PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004067 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004068 const char *errors)
4069{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004070 PyObject *repr = NULL;
4071 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004072
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004073#ifdef NEED_RETRY
4074 retry:
4075 if (size > INT_MAX)
4076 ret = encode_mbcs(&repr, p, INT_MAX);
4077 else
4078#endif
4079 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004080
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004081 if (ret < 0) {
4082 Py_XDECREF(repr);
4083 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004084 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004085
4086#ifdef NEED_RETRY
4087 if (size > INT_MAX) {
4088 p += INT_MAX;
4089 size -= INT_MAX;
4090 goto retry;
4091 }
4092#endif
4093
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004094 return repr;
4095}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004096
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004097PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4098{
4099 if (!PyUnicode_Check(unicode)) {
4100 PyErr_BadArgument();
4101 return NULL;
4102 }
4103 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4104 PyUnicode_GET_SIZE(unicode),
4105 NULL);
4106}
4107
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004108#undef NEED_RETRY
4109
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004110#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004111
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112/* --- Character Mapping Codec -------------------------------------------- */
4113
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004115 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 PyObject *mapping,
4117 const char *errors)
4118{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004120 Py_ssize_t startinpos;
4121 Py_ssize_t endinpos;
4122 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 PyUnicodeObject *v;
4125 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004126 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 PyObject *errorHandler = NULL;
4128 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004129 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004130 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004131
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 /* Default to Latin-1 */
4133 if (mapping == NULL)
4134 return PyUnicode_DecodeLatin1(s, size, errors);
4135
4136 v = _PyUnicode_New(size);
4137 if (v == NULL)
4138 goto onError;
4139 if (size == 0)
4140 return (PyObject *)v;
4141 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004143 if (PyUnicode_CheckExact(mapping)) {
4144 mapstring = PyUnicode_AS_UNICODE(mapping);
4145 maplen = PyUnicode_GET_SIZE(mapping);
4146 while (s < e) {
4147 unsigned char ch = *s;
4148 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004150 if (ch < maplen)
4151 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004153 if (x == 0xfffe) {
4154 /* undefined mapping */
4155 outpos = p-PyUnicode_AS_UNICODE(v);
4156 startinpos = s-starts;
4157 endinpos = startinpos+1;
4158 if (unicode_decode_call_errorhandler(
4159 errors, &errorHandler,
4160 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004161 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004162 (PyObject **)&v, &outpos, &p)) {
4163 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004164 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004165 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004166 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004167 *p++ = x;
4168 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004170 }
4171 else {
4172 while (s < e) {
4173 unsigned char ch = *s;
4174 PyObject *w, *x;
4175
4176 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004177 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004178 if (w == NULL)
4179 goto onError;
4180 x = PyObject_GetItem(mapping, w);
4181 Py_DECREF(w);
4182 if (x == NULL) {
4183 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4184 /* No mapping found means: mapping is undefined. */
4185 PyErr_Clear();
4186 x = Py_None;
4187 Py_INCREF(x);
4188 } else
4189 goto onError;
4190 }
4191
4192 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004193 if (PyLong_Check(x)) {
4194 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004195 if (value < 0 || value > 65535) {
4196 PyErr_SetString(PyExc_TypeError,
4197 "character mapping must be in range(65536)");
4198 Py_DECREF(x);
4199 goto onError;
4200 }
4201 *p++ = (Py_UNICODE)value;
4202 }
4203 else if (x == Py_None) {
4204 /* undefined mapping */
4205 outpos = p-PyUnicode_AS_UNICODE(v);
4206 startinpos = s-starts;
4207 endinpos = startinpos+1;
4208 if (unicode_decode_call_errorhandler(
4209 errors, &errorHandler,
4210 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004211 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004212 (PyObject **)&v, &outpos, &p)) {
4213 Py_DECREF(x);
4214 goto onError;
4215 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004216 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004217 continue;
4218 }
4219 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004220 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004221
4222 if (targetsize == 1)
4223 /* 1-1 mapping */
4224 *p++ = *PyUnicode_AS_UNICODE(x);
4225
4226 else if (targetsize > 1) {
4227 /* 1-n mapping */
4228 if (targetsize > extrachars) {
4229 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004230 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4231 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004232 (targetsize << 2);
4233 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004234 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004235 if (_PyUnicode_Resize(&v,
4236 PyUnicode_GET_SIZE(v) + needed) < 0) {
4237 Py_DECREF(x);
4238 goto onError;
4239 }
4240 p = PyUnicode_AS_UNICODE(v) + oldpos;
4241 }
4242 Py_UNICODE_COPY(p,
4243 PyUnicode_AS_UNICODE(x),
4244 targetsize);
4245 p += targetsize;
4246 extrachars -= targetsize;
4247 }
4248 /* 1-0 mapping: skip the character */
4249 }
4250 else {
4251 /* wrong return value */
4252 PyErr_SetString(PyExc_TypeError,
4253 "character mapping must return integer, None or unicode");
4254 Py_DECREF(x);
4255 goto onError;
4256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004258 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 }
4261 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004262 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264 Py_XDECREF(errorHandler);
4265 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004267
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 Py_XDECREF(errorHandler);
4270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 Py_XDECREF(v);
4272 return NULL;
4273}
4274
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004275/* Charmap encoding: the lookup table */
4276
4277struct encoding_map{
4278 PyObject_HEAD
4279 unsigned char level1[32];
4280 int count2, count3;
4281 unsigned char level23[1];
4282};
4283
4284static PyObject*
4285encoding_map_size(PyObject *obj, PyObject* args)
4286{
4287 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004288 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004289 128*map->count3);
4290}
4291
4292static PyMethodDef encoding_map_methods[] = {
4293 {"size", encoding_map_size, METH_NOARGS,
4294 PyDoc_STR("Return the size (in bytes) of this object") },
4295 { 0 }
4296};
4297
4298static void
4299encoding_map_dealloc(PyObject* o)
4300{
4301 PyObject_FREE(o);
4302}
4303
4304static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004305 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004306 "EncodingMap", /*tp_name*/
4307 sizeof(struct encoding_map), /*tp_basicsize*/
4308 0, /*tp_itemsize*/
4309 /* methods */
4310 encoding_map_dealloc, /*tp_dealloc*/
4311 0, /*tp_print*/
4312 0, /*tp_getattr*/
4313 0, /*tp_setattr*/
4314 0, /*tp_compare*/
4315 0, /*tp_repr*/
4316 0, /*tp_as_number*/
4317 0, /*tp_as_sequence*/
4318 0, /*tp_as_mapping*/
4319 0, /*tp_hash*/
4320 0, /*tp_call*/
4321 0, /*tp_str*/
4322 0, /*tp_getattro*/
4323 0, /*tp_setattro*/
4324 0, /*tp_as_buffer*/
4325 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4326 0, /*tp_doc*/
4327 0, /*tp_traverse*/
4328 0, /*tp_clear*/
4329 0, /*tp_richcompare*/
4330 0, /*tp_weaklistoffset*/
4331 0, /*tp_iter*/
4332 0, /*tp_iternext*/
4333 encoding_map_methods, /*tp_methods*/
4334 0, /*tp_members*/
4335 0, /*tp_getset*/
4336 0, /*tp_base*/
4337 0, /*tp_dict*/
4338 0, /*tp_descr_get*/
4339 0, /*tp_descr_set*/
4340 0, /*tp_dictoffset*/
4341 0, /*tp_init*/
4342 0, /*tp_alloc*/
4343 0, /*tp_new*/
4344 0, /*tp_free*/
4345 0, /*tp_is_gc*/
4346};
4347
4348PyObject*
4349PyUnicode_BuildEncodingMap(PyObject* string)
4350{
4351 Py_UNICODE *decode;
4352 PyObject *result;
4353 struct encoding_map *mresult;
4354 int i;
4355 int need_dict = 0;
4356 unsigned char level1[32];
4357 unsigned char level2[512];
4358 unsigned char *mlevel1, *mlevel2, *mlevel3;
4359 int count2 = 0, count3 = 0;
4360
4361 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4362 PyErr_BadArgument();
4363 return NULL;
4364 }
4365 decode = PyUnicode_AS_UNICODE(string);
4366 memset(level1, 0xFF, sizeof level1);
4367 memset(level2, 0xFF, sizeof level2);
4368
4369 /* If there isn't a one-to-one mapping of NULL to \0,
4370 or if there are non-BMP characters, we need to use
4371 a mapping dictionary. */
4372 if (decode[0] != 0)
4373 need_dict = 1;
4374 for (i = 1; i < 256; i++) {
4375 int l1, l2;
4376 if (decode[i] == 0
4377 #ifdef Py_UNICODE_WIDE
4378 || decode[i] > 0xFFFF
4379 #endif
4380 ) {
4381 need_dict = 1;
4382 break;
4383 }
4384 if (decode[i] == 0xFFFE)
4385 /* unmapped character */
4386 continue;
4387 l1 = decode[i] >> 11;
4388 l2 = decode[i] >> 7;
4389 if (level1[l1] == 0xFF)
4390 level1[l1] = count2++;
4391 if (level2[l2] == 0xFF)
4392 level2[l2] = count3++;
4393 }
4394
4395 if (count2 >= 0xFF || count3 >= 0xFF)
4396 need_dict = 1;
4397
4398 if (need_dict) {
4399 PyObject *result = PyDict_New();
4400 PyObject *key, *value;
4401 if (!result)
4402 return NULL;
4403 for (i = 0; i < 256; i++) {
4404 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004405 key = PyLong_FromLong(decode[i]);
4406 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004407 if (!key || !value)
4408 goto failed1;
4409 if (PyDict_SetItem(result, key, value) == -1)
4410 goto failed1;
4411 Py_DECREF(key);
4412 Py_DECREF(value);
4413 }
4414 return result;
4415 failed1:
4416 Py_XDECREF(key);
4417 Py_XDECREF(value);
4418 Py_DECREF(result);
4419 return NULL;
4420 }
4421
4422 /* Create a three-level trie */
4423 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4424 16*count2 + 128*count3 - 1);
4425 if (!result)
4426 return PyErr_NoMemory();
4427 PyObject_Init(result, &EncodingMapType);
4428 mresult = (struct encoding_map*)result;
4429 mresult->count2 = count2;
4430 mresult->count3 = count3;
4431 mlevel1 = mresult->level1;
4432 mlevel2 = mresult->level23;
4433 mlevel3 = mresult->level23 + 16*count2;
4434 memcpy(mlevel1, level1, 32);
4435 memset(mlevel2, 0xFF, 16*count2);
4436 memset(mlevel3, 0, 128*count3);
4437 count3 = 0;
4438 for (i = 1; i < 256; i++) {
4439 int o1, o2, o3, i2, i3;
4440 if (decode[i] == 0xFFFE)
4441 /* unmapped character */
4442 continue;
4443 o1 = decode[i]>>11;
4444 o2 = (decode[i]>>7) & 0xF;
4445 i2 = 16*mlevel1[o1] + o2;
4446 if (mlevel2[i2] == 0xFF)
4447 mlevel2[i2] = count3++;
4448 o3 = decode[i] & 0x7F;
4449 i3 = 128*mlevel2[i2] + o3;
4450 mlevel3[i3] = i;
4451 }
4452 return result;
4453}
4454
4455static int
4456encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4457{
4458 struct encoding_map *map = (struct encoding_map*)mapping;
4459 int l1 = c>>11;
4460 int l2 = (c>>7) & 0xF;
4461 int l3 = c & 0x7F;
4462 int i;
4463
4464#ifdef Py_UNICODE_WIDE
4465 if (c > 0xFFFF) {
4466 return -1;
4467 }
4468#endif
4469 if (c == 0)
4470 return 0;
4471 /* level 1*/
4472 i = map->level1[l1];
4473 if (i == 0xFF) {
4474 return -1;
4475 }
4476 /* level 2*/
4477 i = map->level23[16*i+l2];
4478 if (i == 0xFF) {
4479 return -1;
4480 }
4481 /* level 3 */
4482 i = map->level23[16*map->count2 + 128*i + l3];
4483 if (i == 0) {
4484 return -1;
4485 }
4486 return i;
4487}
4488
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489/* Lookup the character ch in the mapping. If the character
4490 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004491 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493{
Christian Heimes217cfd12007-12-02 14:31:20 +00004494 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 PyObject *x;
4496
4497 if (w == NULL)
4498 return NULL;
4499 x = PyObject_GetItem(mapping, w);
4500 Py_DECREF(w);
4501 if (x == NULL) {
4502 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4503 /* No mapping found means: mapping is undefined. */
4504 PyErr_Clear();
4505 x = Py_None;
4506 Py_INCREF(x);
4507 return x;
4508 } else
4509 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004511 else if (x == Py_None)
4512 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004513 else if (PyLong_Check(x)) {
4514 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 if (value < 0 || value > 255) {
4516 PyErr_SetString(PyExc_TypeError,
4517 "character mapping must be in range(256)");
4518 Py_DECREF(x);
4519 return NULL;
4520 }
4521 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004523 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004527 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004528 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004529 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 Py_DECREF(x);
4531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532 }
4533}
4534
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004535static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004536charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004537{
Christian Heimes72b710a2008-05-26 13:28:38 +00004538 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004539 /* exponentially overallocate to minimize reallocations */
4540 if (requiredsize < 2*outsize)
4541 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004542 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004543 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004544 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004545}
4546
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004551 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 space is available. Return a new reference to the object that
4553 was put in the output buffer, or Py_None, if the mapping was undefined
4554 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004555 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004557charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004558 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004560 PyObject *rep;
4561 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004562 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563
Christian Heimes90aa7642007-12-19 02:45:37 +00004564 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004565 int res = encoding_map_lookup(c, mapping);
4566 Py_ssize_t requiredsize = *outpos+1;
4567 if (res == -1)
4568 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004569 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004570 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004571 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004572 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004573 outstart[(*outpos)++] = (char)res;
4574 return enc_SUCCESS;
4575 }
4576
4577 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004579 return enc_EXCEPTION;
4580 else if (rep==Py_None) {
4581 Py_DECREF(rep);
4582 return enc_FAILED;
4583 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004584 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004585 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004586 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004587 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004589 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004591 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004592 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 }
4594 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004595 const char *repchars = PyBytes_AS_STRING(rep);
4596 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004597 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004598 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004599 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004601 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004603 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 memcpy(outstart + *outpos, repchars, repsize);
4605 *outpos += repsize;
4606 }
4607 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004608 Py_DECREF(rep);
4609 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610}
4611
4612/* handle an error in PyUnicode_EncodeCharmap
4613 Return 0 on success, -1 on error */
4614static
4615int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004616 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004618 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004619 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620{
4621 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004622 Py_ssize_t repsize;
4623 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624 Py_UNICODE *uni2;
4625 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004626 Py_ssize_t collstartpos = *inpos;
4627 Py_ssize_t collendpos = *inpos+1;
4628 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 char *encoding = "charmap";
4630 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004631 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 /* find all unencodable characters */
4634 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004635 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004636 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004637 int res = encoding_map_lookup(p[collendpos], mapping);
4638 if (res != -1)
4639 break;
4640 ++collendpos;
4641 continue;
4642 }
4643
4644 rep = charmapencode_lookup(p[collendpos], mapping);
4645 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004647 else if (rep!=Py_None) {
4648 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 break;
4650 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004651 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 ++collendpos;
4653 }
4654 /* cache callback name lookup
4655 * (if not done yet, i.e. it's the first error) */
4656 if (*known_errorHandler==-1) {
4657 if ((errors==NULL) || (!strcmp(errors, "strict")))
4658 *known_errorHandler = 1;
4659 else if (!strcmp(errors, "replace"))
4660 *known_errorHandler = 2;
4661 else if (!strcmp(errors, "ignore"))
4662 *known_errorHandler = 3;
4663 else if (!strcmp(errors, "xmlcharrefreplace"))
4664 *known_errorHandler = 4;
4665 else
4666 *known_errorHandler = 0;
4667 }
4668 switch (*known_errorHandler) {
4669 case 1: /* strict */
4670 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4671 return -1;
4672 case 2: /* replace */
4673 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4674 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004675 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 return -1;
4677 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004678 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4680 return -1;
4681 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 }
4683 /* fall through */
4684 case 3: /* ignore */
4685 *inpos = collendpos;
4686 break;
4687 case 4: /* xmlcharrefreplace */
4688 /* generate replacement (temporarily (mis)uses p) */
4689 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4690 char buffer[2+29+1+1];
4691 char *cp;
4692 sprintf(buffer, "&#%d;", (int)p[collpos]);
4693 for (cp = buffer; *cp; ++cp) {
4694 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004695 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004697 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4699 return -1;
4700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 }
4702 }
4703 *inpos = collendpos;
4704 break;
4705 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004706 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004707 encoding, reason, p, size, exceptionObject,
4708 collstartpos, collendpos, &newpos);
4709 if (repunicode == NULL)
4710 return -1;
4711 /* generate replacement */
4712 repsize = PyUnicode_GET_SIZE(repunicode);
4713 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4714 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004715 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 return -1;
4717 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004718 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004720 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4721 return -1;
4722 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 }
4724 *inpos = newpos;
4725 Py_DECREF(repunicode);
4726 }
4727 return 0;
4728}
4729
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004731 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 PyObject *mapping,
4733 const char *errors)
4734{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 /* output object */
4736 PyObject *res = NULL;
4737 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004738 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004740 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 PyObject *errorHandler = NULL;
4742 PyObject *exc = NULL;
4743 /* the following variable is used for caching string comparisons
4744 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4745 * 3=ignore, 4=xmlcharrefreplace */
4746 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747
4748 /* Default to Latin-1 */
4749 if (mapping == NULL)
4750 return PyUnicode_EncodeLatin1(p, size, errors);
4751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 /* allocate enough for a simple encoding without
4753 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004754 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755 if (res == NULL)
4756 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004757 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760 while (inpos<size) {
4761 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004762 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004763 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004765 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 if (charmap_encoding_error(p, size, &inpos, mapping,
4767 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004768 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004769 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004770 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 else
4774 /* done with this character => adjust input position */
4775 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004778 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004779 if (respos<PyBytes_GET_SIZE(res))
4780 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 Py_XDECREF(exc);
4783 Py_XDECREF(errorHandler);
4784 return res;
4785
4786 onError:
4787 Py_XDECREF(res);
4788 Py_XDECREF(exc);
4789 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 return NULL;
4791}
4792
4793PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4794 PyObject *mapping)
4795{
4796 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4797 PyErr_BadArgument();
4798 return NULL;
4799 }
4800 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4801 PyUnicode_GET_SIZE(unicode),
4802 mapping,
4803 NULL);
4804}
4805
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806/* create or adjust a UnicodeTranslateError */
4807static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004808 const Py_UNICODE *unicode, Py_ssize_t size,
4809 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 if (*exceptionObject == NULL) {
4813 *exceptionObject = PyUnicodeTranslateError_Create(
4814 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 }
4816 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4818 goto onError;
4819 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4820 goto onError;
4821 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4822 goto onError;
4823 return;
4824 onError:
4825 Py_DECREF(*exceptionObject);
4826 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 }
4828}
4829
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830/* raises a UnicodeTranslateError */
4831static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004832 const Py_UNICODE *unicode, Py_ssize_t size,
4833 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 const char *reason)
4835{
4836 make_translate_exception(exceptionObject,
4837 unicode, size, startpos, endpos, reason);
4838 if (*exceptionObject != NULL)
4839 PyCodec_StrictErrors(*exceptionObject);
4840}
4841
4842/* error handling callback helper:
4843 build arguments, call the callback and check the arguments,
4844 put the result into newpos and return the replacement string, which
4845 has to be freed by the caller */
4846static PyObject *unicode_translate_call_errorhandler(const char *errors,
4847 PyObject **errorHandler,
4848 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004849 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4850 Py_ssize_t startpos, Py_ssize_t endpos,
4851 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004852{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004853 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004854
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004855 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 PyObject *restuple;
4857 PyObject *resunicode;
4858
4859 if (*errorHandler == NULL) {
4860 *errorHandler = PyCodec_LookupError(errors);
4861 if (*errorHandler == NULL)
4862 return NULL;
4863 }
4864
4865 make_translate_exception(exceptionObject,
4866 unicode, size, startpos, endpos, reason);
4867 if (*exceptionObject == NULL)
4868 return NULL;
4869
4870 restuple = PyObject_CallFunctionObjArgs(
4871 *errorHandler, *exceptionObject, NULL);
4872 if (restuple == NULL)
4873 return NULL;
4874 if (!PyTuple_Check(restuple)) {
4875 PyErr_Format(PyExc_TypeError, &argparse[4]);
4876 Py_DECREF(restuple);
4877 return NULL;
4878 }
4879 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004880 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004881 Py_DECREF(restuple);
4882 return NULL;
4883 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004884 if (i_newpos<0)
4885 *newpos = size+i_newpos;
4886 else
4887 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004888 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004889 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004890 Py_DECREF(restuple);
4891 return NULL;
4892 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 Py_INCREF(resunicode);
4894 Py_DECREF(restuple);
4895 return resunicode;
4896}
4897
4898/* Lookup the character ch in the mapping and put the result in result,
4899 which must be decrefed by the caller.
4900 Return 0 on success, -1 on error */
4901static
4902int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4903{
Christian Heimes217cfd12007-12-02 14:31:20 +00004904 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 PyObject *x;
4906
4907 if (w == NULL)
4908 return -1;
4909 x = PyObject_GetItem(mapping, w);
4910 Py_DECREF(w);
4911 if (x == NULL) {
4912 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4913 /* No mapping found means: use 1:1 mapping. */
4914 PyErr_Clear();
4915 *result = NULL;
4916 return 0;
4917 } else
4918 return -1;
4919 }
4920 else if (x == Py_None) {
4921 *result = x;
4922 return 0;
4923 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004924 else if (PyLong_Check(x)) {
4925 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 long max = PyUnicode_GetMax();
4927 if (value < 0 || value > max) {
4928 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004929 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 Py_DECREF(x);
4931 return -1;
4932 }
4933 *result = x;
4934 return 0;
4935 }
4936 else if (PyUnicode_Check(x)) {
4937 *result = x;
4938 return 0;
4939 }
4940 else {
4941 /* wrong return value */
4942 PyErr_SetString(PyExc_TypeError,
4943 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004944 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004945 return -1;
4946 }
4947}
4948/* ensure that *outobj is at least requiredsize characters long,
4949if not reallocate and adjust various state variables.
4950Return 0 on success, -1 on error */
4951static
Walter Dörwald4894c302003-10-24 14:25:28 +00004952int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004953 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004954{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004955 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004956 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004957 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004958 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004960 if (requiredsize < 2 * oldsize)
4961 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004962 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004963 return -1;
4964 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004965 }
4966 return 0;
4967}
4968/* lookup the character, put the result in the output string and adjust
4969 various state variables. Return a new reference to the object that
4970 was put in the output buffer in *result, or Py_None, if the mapping was
4971 undefined (in which case no character was written).
4972 The called must decref result.
4973 Return 0 on success, -1 on error. */
4974static
Walter Dörwald4894c302003-10-24 14:25:28 +00004975int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004976 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004977 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978{
Walter Dörwald4894c302003-10-24 14:25:28 +00004979 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004980 return -1;
4981 if (*res==NULL) {
4982 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004983 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004984 }
4985 else if (*res==Py_None)
4986 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00004987 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00004989 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004990 }
4991 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004992 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004993 if (repsize==1) {
4994 /* no overflow check, because we know that the space is enough */
4995 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4996 }
4997 else if (repsize!=0) {
4998 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004999 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005000 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005001 repsize - 1;
5002 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005003 return -1;
5004 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5005 *outp += repsize;
5006 }
5007 }
5008 else
5009 return -1;
5010 return 0;
5011}
5012
5013PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005014 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015 PyObject *mapping,
5016 const char *errors)
5017{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005018 /* output object */
5019 PyObject *res = NULL;
5020 /* pointers to the beginning and end+1 of input */
5021 const Py_UNICODE *startp = p;
5022 const Py_UNICODE *endp = p + size;
5023 /* pointer into the output */
5024 Py_UNICODE *str;
5025 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005026 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005027 char *reason = "character maps to <undefined>";
5028 PyObject *errorHandler = NULL;
5029 PyObject *exc = NULL;
5030 /* the following variable is used for caching string comparisons
5031 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5032 * 3=ignore, 4=xmlcharrefreplace */
5033 int known_errorHandler = -1;
5034
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 if (mapping == NULL) {
5036 PyErr_BadArgument();
5037 return NULL;
5038 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005039
5040 /* allocate enough for a simple 1:1 translation without
5041 replacements, if we need more, we'll resize */
5042 res = PyUnicode_FromUnicode(NULL, size);
5043 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005044 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005046 return res;
5047 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005049 while (p<endp) {
5050 /* try to encode it */
5051 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005052 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054 goto onError;
5055 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005056 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005057 if (x!=Py_None) /* it worked => adjust input pointer */
5058 ++p;
5059 else { /* untranslatable character */
5060 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005061 Py_ssize_t repsize;
5062 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005063 Py_UNICODE *uni2;
5064 /* startpos for collecting untranslatable chars */
5065 const Py_UNICODE *collstart = p;
5066 const Py_UNICODE *collend = p+1;
5067 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005069 /* find all untranslatable characters */
5070 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005071 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005072 goto onError;
5073 Py_XDECREF(x);
5074 if (x!=Py_None)
5075 break;
5076 ++collend;
5077 }
5078 /* cache callback name lookup
5079 * (if not done yet, i.e. it's the first error) */
5080 if (known_errorHandler==-1) {
5081 if ((errors==NULL) || (!strcmp(errors, "strict")))
5082 known_errorHandler = 1;
5083 else if (!strcmp(errors, "replace"))
5084 known_errorHandler = 2;
5085 else if (!strcmp(errors, "ignore"))
5086 known_errorHandler = 3;
5087 else if (!strcmp(errors, "xmlcharrefreplace"))
5088 known_errorHandler = 4;
5089 else
5090 known_errorHandler = 0;
5091 }
5092 switch (known_errorHandler) {
5093 case 1: /* strict */
5094 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5095 goto onError;
5096 case 2: /* replace */
5097 /* No need to check for space, this is a 1:1 replacement */
5098 for (coll = collstart; coll<collend; ++coll)
5099 *str++ = '?';
5100 /* fall through */
5101 case 3: /* ignore */
5102 p = collend;
5103 break;
5104 case 4: /* xmlcharrefreplace */
5105 /* generate replacement (temporarily (mis)uses p) */
5106 for (p = collstart; p < collend; ++p) {
5107 char buffer[2+29+1+1];
5108 char *cp;
5109 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005110 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005111 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5112 goto onError;
5113 for (cp = buffer; *cp; ++cp)
5114 *str++ = *cp;
5115 }
5116 p = collend;
5117 break;
5118 default:
5119 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5120 reason, startp, size, &exc,
5121 collstart-startp, collend-startp, &newpos);
5122 if (repunicode == NULL)
5123 goto onError;
5124 /* generate replacement */
5125 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005126 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005127 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5128 Py_DECREF(repunicode);
5129 goto onError;
5130 }
5131 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5132 *str++ = *uni2;
5133 p = startp + newpos;
5134 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 }
5136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138 /* Resize if we allocated to much */
5139 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005140 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005141 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005142 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005143 }
5144 Py_XDECREF(exc);
5145 Py_XDECREF(errorHandler);
5146 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005148 onError:
5149 Py_XDECREF(res);
5150 Py_XDECREF(exc);
5151 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 return NULL;
5153}
5154
5155PyObject *PyUnicode_Translate(PyObject *str,
5156 PyObject *mapping,
5157 const char *errors)
5158{
5159 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005160
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 str = PyUnicode_FromObject(str);
5162 if (str == NULL)
5163 goto onError;
5164 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5165 PyUnicode_GET_SIZE(str),
5166 mapping,
5167 errors);
5168 Py_DECREF(str);
5169 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005170
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 onError:
5172 Py_XDECREF(str);
5173 return NULL;
5174}
Tim Petersced69f82003-09-16 20:30:58 +00005175
Guido van Rossum9e896b32000-04-05 20:11:21 +00005176/* --- Decimal Encoder ---------------------------------------------------- */
5177
5178int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005179 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005180 char *output,
5181 const char *errors)
5182{
5183 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005184 PyObject *errorHandler = NULL;
5185 PyObject *exc = NULL;
5186 const char *encoding = "decimal";
5187 const char *reason = "invalid decimal Unicode string";
5188 /* the following variable is used for caching string comparisons
5189 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5190 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005191
5192 if (output == NULL) {
5193 PyErr_BadArgument();
5194 return -1;
5195 }
5196
5197 p = s;
5198 end = s + length;
5199 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005200 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005201 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005202 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 Py_ssize_t repsize;
5204 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 Py_UNICODE *uni2;
5206 Py_UNICODE *collstart;
5207 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005208
Guido van Rossum9e896b32000-04-05 20:11:21 +00005209 if (Py_UNICODE_ISSPACE(ch)) {
5210 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005211 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005212 continue;
5213 }
5214 decimal = Py_UNICODE_TODECIMAL(ch);
5215 if (decimal >= 0) {
5216 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005217 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005218 continue;
5219 }
Guido van Rossumba477042000-04-06 18:18:10 +00005220 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005221 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005223 continue;
5224 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005225 /* All other characters are considered unencodable */
5226 collstart = p;
5227 collend = p+1;
5228 while (collend < end) {
5229 if ((0 < *collend && *collend < 256) ||
5230 !Py_UNICODE_ISSPACE(*collend) ||
5231 Py_UNICODE_TODECIMAL(*collend))
5232 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005233 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234 /* cache callback name lookup
5235 * (if not done yet, i.e. it's the first error) */
5236 if (known_errorHandler==-1) {
5237 if ((errors==NULL) || (!strcmp(errors, "strict")))
5238 known_errorHandler = 1;
5239 else if (!strcmp(errors, "replace"))
5240 known_errorHandler = 2;
5241 else if (!strcmp(errors, "ignore"))
5242 known_errorHandler = 3;
5243 else if (!strcmp(errors, "xmlcharrefreplace"))
5244 known_errorHandler = 4;
5245 else
5246 known_errorHandler = 0;
5247 }
5248 switch (known_errorHandler) {
5249 case 1: /* strict */
5250 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5251 goto onError;
5252 case 2: /* replace */
5253 for (p = collstart; p < collend; ++p)
5254 *output++ = '?';
5255 /* fall through */
5256 case 3: /* ignore */
5257 p = collend;
5258 break;
5259 case 4: /* xmlcharrefreplace */
5260 /* generate replacement (temporarily (mis)uses p) */
5261 for (p = collstart; p < collend; ++p)
5262 output += sprintf(output, "&#%d;", (int)*p);
5263 p = collend;
5264 break;
5265 default:
5266 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5267 encoding, reason, s, length, &exc,
5268 collstart-s, collend-s, &newpos);
5269 if (repunicode == NULL)
5270 goto onError;
5271 /* generate replacement */
5272 repsize = PyUnicode_GET_SIZE(repunicode);
5273 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5274 Py_UNICODE ch = *uni2;
5275 if (Py_UNICODE_ISSPACE(ch))
5276 *output++ = ' ';
5277 else {
5278 decimal = Py_UNICODE_TODECIMAL(ch);
5279 if (decimal >= 0)
5280 *output++ = '0' + decimal;
5281 else if (0 < ch && ch < 256)
5282 *output++ = (char)ch;
5283 else {
5284 Py_DECREF(repunicode);
5285 raise_encode_exception(&exc, encoding,
5286 s, length, collstart-s, collend-s, reason);
5287 goto onError;
5288 }
5289 }
5290 }
5291 p = s + newpos;
5292 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005293 }
5294 }
5295 /* 0-terminate the output string */
5296 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005297 Py_XDECREF(exc);
5298 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005299 return 0;
5300
5301 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005302 Py_XDECREF(exc);
5303 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005304 return -1;
5305}
5306
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307/* --- Helpers ------------------------------------------------------------ */
5308
Eric Smith8c663262007-08-25 02:26:07 +00005309#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005310#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005311#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005312/* Include _ParseTupleFinds from find.h */
5313#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005314#include "stringlib/find.h"
5315#include "stringlib/partition.h"
5316
Eric Smith5807c412008-05-11 21:00:57 +00005317#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5318#include "stringlib/localeutil.h"
5319
/* helper macro to fixup start/end slice values

   Works on variables named `start` and `end`, which must be in scope where
   the macro is used, and on (obj)->length:
     - a negative `start` is offset by the length, then clamped to 0
     - `end` is first capped at the length; a negative `end` is then offset
       by the length and clamped to 0

   The expansion is a plain sequence of `if` statements that already ends
   in ';' (it is not wrapped in do { } while (0)), and `obj` is evaluated
   more than once. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5332
Martin v. Löwis18e16552006-02-15 17:27:45 +00005333Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005334 PyObject *substr,
5335 Py_ssize_t start,
5336 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005339 PyUnicodeObject* str_obj;
5340 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005341
Thomas Wouters477c8d52006-05-27 19:21:47 +00005342 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5343 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005345 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5346 if (!sub_obj) {
5347 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 return -1;
5349 }
Tim Petersced69f82003-09-16 20:30:58 +00005350
Thomas Wouters477c8d52006-05-27 19:21:47 +00005351 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005352
Thomas Wouters477c8d52006-05-27 19:21:47 +00005353 result = stringlib_count(
5354 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5355 );
5356
5357 Py_DECREF(sub_obj);
5358 Py_DECREF(str_obj);
5359
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 return result;
5361}
5362
Martin v. Löwis18e16552006-02-15 17:27:45 +00005363Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005364 PyObject *sub,
5365 Py_ssize_t start,
5366 Py_ssize_t end,
5367 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005370
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005372 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005373 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005374 sub = PyUnicode_FromObject(sub);
5375 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005376 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005377 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 }
Tim Petersced69f82003-09-16 20:30:58 +00005379
Thomas Wouters477c8d52006-05-27 19:21:47 +00005380 if (direction > 0)
5381 result = stringlib_find_slice(
5382 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5383 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5384 start, end
5385 );
5386 else
5387 result = stringlib_rfind_slice(
5388 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5389 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5390 start, end
5391 );
5392
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005394 Py_DECREF(sub);
5395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 return result;
5397}
5398
Tim Petersced69f82003-09-16 20:30:58 +00005399static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400int tailmatch(PyUnicodeObject *self,
5401 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005402 Py_ssize_t start,
5403 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 int direction)
5405{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 if (substring->length == 0)
5407 return 1;
5408
Thomas Wouters477c8d52006-05-27 19:21:47 +00005409 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410
5411 end -= substring->length;
5412 if (end < start)
5413 return 0;
5414
5415 if (direction > 0) {
5416 if (Py_UNICODE_MATCH(self, end, substring))
5417 return 1;
5418 } else {
5419 if (Py_UNICODE_MATCH(self, start, substring))
5420 return 1;
5421 }
5422
5423 return 0;
5424}
5425
Martin v. Löwis18e16552006-02-15 17:27:45 +00005426Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005428 Py_ssize_t start,
5429 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 int direction)
5431{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005432 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005433
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 str = PyUnicode_FromObject(str);
5435 if (str == NULL)
5436 return -1;
5437 substr = PyUnicode_FromObject(substr);
5438 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005439 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 return -1;
5441 }
Tim Petersced69f82003-09-16 20:30:58 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 result = tailmatch((PyUnicodeObject *)str,
5444 (PyUnicodeObject *)substr,
5445 start, end, direction);
5446 Py_DECREF(str);
5447 Py_DECREF(substr);
5448 return result;
5449}
5450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451/* Apply fixfct filter to the Unicode object self and return a
5452 reference to the modified object */
5453
Tim Petersced69f82003-09-16 20:30:58 +00005454static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455PyObject *fixup(PyUnicodeObject *self,
5456 int (*fixfct)(PyUnicodeObject *s))
5457{
5458
5459 PyUnicodeObject *u;
5460
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005461 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 if (u == NULL)
5463 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005464
5465 Py_UNICODE_COPY(u->str, self->str, self->length);
5466
Tim Peters7a29bd52001-09-12 03:03:31 +00005467 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 /* fixfct should return TRUE if it modified the buffer. If
5469 FALSE, return a reference to the original buffer instead
5470 (to save space, not time) */
5471 Py_INCREF(self);
5472 Py_DECREF(u);
5473 return (PyObject*) self;
5474 }
5475 return (PyObject*) u;
5476}
5477
Tim Petersced69f82003-09-16 20:30:58 +00005478static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479int fixupper(PyUnicodeObject *self)
5480{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005481 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 Py_UNICODE *s = self->str;
5483 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005484
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 while (len-- > 0) {
5486 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005487
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 ch = Py_UNICODE_TOUPPER(*s);
5489 if (ch != *s) {
5490 status = 1;
5491 *s = ch;
5492 }
5493 s++;
5494 }
5495
5496 return status;
5497}
5498
Tim Petersced69f82003-09-16 20:30:58 +00005499static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500int fixlower(PyUnicodeObject *self)
5501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005502 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 Py_UNICODE *s = self->str;
5504 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005505
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 while (len-- > 0) {
5507 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005508
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 ch = Py_UNICODE_TOLOWER(*s);
5510 if (ch != *s) {
5511 status = 1;
5512 *s = ch;
5513 }
5514 s++;
5515 }
5516
5517 return status;
5518}
5519
Tim Petersced69f82003-09-16 20:30:58 +00005520static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521int fixswapcase(PyUnicodeObject *self)
5522{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005523 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 Py_UNICODE *s = self->str;
5525 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005526
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 while (len-- > 0) {
5528 if (Py_UNICODE_ISUPPER(*s)) {
5529 *s = Py_UNICODE_TOLOWER(*s);
5530 status = 1;
5531 } else if (Py_UNICODE_ISLOWER(*s)) {
5532 *s = Py_UNICODE_TOUPPER(*s);
5533 status = 1;
5534 }
5535 s++;
5536 }
5537
5538 return status;
5539}
5540
Tim Petersced69f82003-09-16 20:30:58 +00005541static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542int fixcapitalize(PyUnicodeObject *self)
5543{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005544 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005545 Py_UNICODE *s = self->str;
5546 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005547
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005548 if (len == 0)
5549 return 0;
5550 if (Py_UNICODE_ISLOWER(*s)) {
5551 *s = Py_UNICODE_TOUPPER(*s);
5552 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005554 s++;
5555 while (--len > 0) {
5556 if (Py_UNICODE_ISUPPER(*s)) {
5557 *s = Py_UNICODE_TOLOWER(*s);
5558 status = 1;
5559 }
5560 s++;
5561 }
5562 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563}
5564
5565static
5566int fixtitle(PyUnicodeObject *self)
5567{
5568 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5569 register Py_UNICODE *e;
5570 int previous_is_cased;
5571
5572 /* Shortcut for single character strings */
5573 if (PyUnicode_GET_SIZE(self) == 1) {
5574 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5575 if (*p != ch) {
5576 *p = ch;
5577 return 1;
5578 }
5579 else
5580 return 0;
5581 }
Tim Petersced69f82003-09-16 20:30:58 +00005582
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 e = p + PyUnicode_GET_SIZE(self);
5584 previous_is_cased = 0;
5585 for (; p < e; p++) {
5586 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005587
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 if (previous_is_cased)
5589 *p = Py_UNICODE_TOLOWER(ch);
5590 else
5591 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005592
5593 if (Py_UNICODE_ISLOWER(ch) ||
5594 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 Py_UNICODE_ISTITLE(ch))
5596 previous_is_cased = 1;
5597 else
5598 previous_is_cased = 0;
5599 }
5600 return 1;
5601}
5602
Tim Peters8ce9f162004-08-27 01:49:32 +00005603PyObject *
5604PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605{
Tim Peters8ce9f162004-08-27 01:49:32 +00005606 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005607 const Py_UNICODE blank = ' ';
5608 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005609 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005610 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005611 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5612 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005613 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5614 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005615 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005616 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005617 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618
Tim Peters05eba1f2004-08-27 21:32:02 +00005619 fseq = PySequence_Fast(seq, "");
5620 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005621 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005622 }
5623
Tim Peters91879ab2004-08-27 22:35:44 +00005624 /* Grrrr. A codec may be invoked to convert str objects to
5625 * Unicode, and so it's possible to call back into Python code
5626 * during PyUnicode_FromObject(), and so it's possible for a sick
5627 * codec to change the size of fseq (if seq is a list). Therefore
5628 * we have to keep refetching the size -- can't assume seqlen
5629 * is invariant.
5630 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005631 seqlen = PySequence_Fast_GET_SIZE(fseq);
5632 /* If empty sequence, return u"". */
5633 if (seqlen == 0) {
5634 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5635 goto Done;
5636 }
5637 /* If singleton sequence with an exact Unicode, return that. */
5638 if (seqlen == 1) {
5639 item = PySequence_Fast_GET_ITEM(fseq, 0);
5640 if (PyUnicode_CheckExact(item)) {
5641 Py_INCREF(item);
5642 res = (PyUnicodeObject *)item;
5643 goto Done;
5644 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005645 }
5646
Tim Peters05eba1f2004-08-27 21:32:02 +00005647 /* At least two items to join, or one that isn't exact Unicode. */
5648 if (seqlen > 1) {
5649 /* Set up sep and seplen -- they're needed. */
5650 if (separator == NULL) {
5651 sep = &blank;
5652 seplen = 1;
5653 }
5654 else {
5655 internal_separator = PyUnicode_FromObject(separator);
5656 if (internal_separator == NULL)
5657 goto onError;
5658 sep = PyUnicode_AS_UNICODE(internal_separator);
5659 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005660 /* In case PyUnicode_FromObject() mutated seq. */
5661 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005662 }
5663 }
5664
5665 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005666 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005667 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005668 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005669 res_p = PyUnicode_AS_UNICODE(res);
5670 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005671
Tim Peters05eba1f2004-08-27 21:32:02 +00005672 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005673 Py_ssize_t itemlen;
5674 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005675
5676 item = PySequence_Fast_GET_ITEM(fseq, i);
5677 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005678 if (!PyUnicode_Check(item)) {
5679 PyErr_Format(PyExc_TypeError,
5680 "sequence item %zd: expected str instance,"
5681 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005682 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005683 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005684 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005685 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005686 if (item == NULL)
5687 goto onError;
5688 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005689
Tim Peters91879ab2004-08-27 22:35:44 +00005690 /* In case PyUnicode_FromObject() mutated seq. */
5691 seqlen = PySequence_Fast_GET_SIZE(fseq);
5692
Tim Peters8ce9f162004-08-27 01:49:32 +00005693 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005695 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005696 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005697 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005698 if (i < seqlen - 1) {
5699 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005700 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005701 goto Overflow;
5702 }
5703 if (new_res_used > res_alloc) {
5704 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005705 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005706 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005707 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005708 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005709 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005710 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005711 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005713 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005714 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005716
5717 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005718 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005719 res_p += itemlen;
5720 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005721 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005722 res_p += seplen;
5723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005725 res_used = new_res_used;
5726 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005727
Tim Peters05eba1f2004-08-27 21:32:02 +00005728 /* Shrink res to match the used area; this probably can't fail,
5729 * but it's cheap to check.
5730 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005731 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005732 goto onError;
5733
5734 Done:
5735 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005736 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 return (PyObject *)res;
5738
Tim Peters8ce9f162004-08-27 01:49:32 +00005739 Overflow:
5740 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005741 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005742 Py_DECREF(item);
5743 /* fall through */
5744
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005746 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005747 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005748 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 return NULL;
5750}
5751
Tim Petersced69f82003-09-16 20:30:58 +00005752static
5753PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005754 Py_ssize_t left,
5755 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 Py_UNICODE fill)
5757{
5758 PyUnicodeObject *u;
5759
5760 if (left < 0)
5761 left = 0;
5762 if (right < 0)
5763 right = 0;
5764
Tim Peters7a29bd52001-09-12 03:03:31 +00005765 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 Py_INCREF(self);
5767 return self;
5768 }
5769
5770 u = _PyUnicode_New(left + self->length + right);
5771 if (u) {
5772 if (left)
5773 Py_UNICODE_FILL(u->str, fill, left);
5774 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5775 if (right)
5776 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5777 }
5778
5779 return u;
5780}
5781
/* Helper for the split routines: build a unicode object from
   data[left:right] and append it to `list`.

   Relies on the enclosing function providing a `PyObject *str`
   scratch variable, a `list` object and an `onError` label; control
   jumps to `onError` if either the string creation or the append
   fails.  The temporary reference held in `str` is always released
   once the append has been attempted. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    else {                                                              \
        int split_append_failed_ = PyList_Append(list, str);            \
        Py_DECREF(str);                                                 \
        if (split_append_failed_)                                       \
            goto onError;                                               \
    }
5792
5793static
5794PyObject *split_whitespace(PyUnicodeObject *self,
5795 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005796 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 register Py_ssize_t i;
5799 register Py_ssize_t j;
5800 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005802 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803
5804 for (i = j = 0; i < len; ) {
5805 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005806 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 i++;
5808 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005809 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 i++;
5811 if (j < i) {
5812 if (maxcount-- <= 0)
5813 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005814 SPLIT_APPEND(buf, j, i);
5815 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 i++;
5817 j = i;
5818 }
5819 }
5820 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005821 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 }
5823 return list;
5824
5825 onError:
5826 Py_DECREF(list);
5827 return NULL;
5828}
5829
5830PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005831 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833 register Py_ssize_t i;
5834 register Py_ssize_t j;
5835 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 PyObject *list;
5837 PyObject *str;
5838 Py_UNICODE *data;
5839
5840 string = PyUnicode_FromObject(string);
5841 if (string == NULL)
5842 return NULL;
5843 data = PyUnicode_AS_UNICODE(string);
5844 len = PyUnicode_GET_SIZE(string);
5845
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 list = PyList_New(0);
5847 if (!list)
5848 goto onError;
5849
5850 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005851 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005852
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005854 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856
5857 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005858 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 if (i < len) {
5860 if (data[i] == '\r' && i + 1 < len &&
5861 data[i+1] == '\n')
5862 i += 2;
5863 else
5864 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005865 if (keepends)
5866 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 }
Guido van Rossum86662912000-04-11 15:38:46 +00005868 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 j = i;
5870 }
5871 if (j < len) {
5872 SPLIT_APPEND(data, j, len);
5873 }
5874
5875 Py_DECREF(string);
5876 return list;
5877
5878 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005879 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 Py_DECREF(string);
5881 return NULL;
5882}
5883
Tim Petersced69f82003-09-16 20:30:58 +00005884static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885PyObject *split_char(PyUnicodeObject *self,
5886 PyObject *list,
5887 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005888 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005890 register Py_ssize_t i;
5891 register Py_ssize_t j;
5892 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005894 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895
5896 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005897 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 if (maxcount-- <= 0)
5899 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005900 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 i = j = i + 1;
5902 } else
5903 i++;
5904 }
5905 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005906 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 }
5908 return list;
5909
5910 onError:
5911 Py_DECREF(list);
5912 return NULL;
5913}
5914
Tim Petersced69f82003-09-16 20:30:58 +00005915static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916PyObject *split_substring(PyUnicodeObject *self,
5917 PyObject *list,
5918 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005919 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005921 register Py_ssize_t i;
5922 register Py_ssize_t j;
5923 Py_ssize_t len = self->length;
5924 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 PyObject *str;
5926
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005927 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 if (Py_UNICODE_MATCH(self, i, substring)) {
5929 if (maxcount-- <= 0)
5930 break;
5931 SPLIT_APPEND(self->str, j, i);
5932 i = j = i + sublen;
5933 } else
5934 i++;
5935 }
5936 if (j <= len) {
5937 SPLIT_APPEND(self->str, j, len);
5938 }
5939 return list;
5940
5941 onError:
5942 Py_DECREF(list);
5943 return NULL;
5944}
5945
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005946static
5947PyObject *rsplit_whitespace(PyUnicodeObject *self,
5948 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005949 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005950{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005951 register Py_ssize_t i;
5952 register Py_ssize_t j;
5953 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005954 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005955 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005956
5957 for (i = j = len - 1; i >= 0; ) {
5958 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005959 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005960 i--;
5961 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005962 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005963 i--;
5964 if (j > i) {
5965 if (maxcount-- <= 0)
5966 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005967 SPLIT_APPEND(buf, i + 1, j + 1);
5968 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005969 i--;
5970 j = i;
5971 }
5972 }
5973 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005974 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005975 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005976 if (PyList_Reverse(list) < 0)
5977 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005978 return list;
5979
5980 onError:
5981 Py_DECREF(list);
5982 return NULL;
5983}
5984
5985static
5986PyObject *rsplit_char(PyUnicodeObject *self,
5987 PyObject *list,
5988 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005989 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005990{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005991 register Py_ssize_t i;
5992 register Py_ssize_t j;
5993 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005994 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005995 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005996
5997 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005998 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005999 if (maxcount-- <= 0)
6000 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006001 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006002 j = i = i - 1;
6003 } else
6004 i--;
6005 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006006 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006007 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006008 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006009 if (PyList_Reverse(list) < 0)
6010 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006011 return list;
6012
6013 onError:
6014 Py_DECREF(list);
6015 return NULL;
6016}
6017
6018static
6019PyObject *rsplit_substring(PyUnicodeObject *self,
6020 PyObject *list,
6021 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006022 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006023{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006024 register Py_ssize_t i;
6025 register Py_ssize_t j;
6026 Py_ssize_t len = self->length;
6027 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006028 PyObject *str;
6029
6030 for (i = len - sublen, j = len; i >= 0; ) {
6031 if (Py_UNICODE_MATCH(self, i, substring)) {
6032 if (maxcount-- <= 0)
6033 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006034 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006035 j = i;
6036 i -= sublen;
6037 } else
6038 i--;
6039 }
6040 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006041 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006042 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006043 if (PyList_Reverse(list) < 0)
6044 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006045 return list;
6046
6047 onError:
6048 Py_DECREF(list);
6049 return NULL;
6050}
6051
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052#undef SPLIT_APPEND
6053
6054static
6055PyObject *split(PyUnicodeObject *self,
6056 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006057 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058{
6059 PyObject *list;
6060
6061 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006062 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063
6064 list = PyList_New(0);
6065 if (!list)
6066 return NULL;
6067
6068 if (substring == NULL)
6069 return split_whitespace(self,list,maxcount);
6070
6071 else if (substring->length == 1)
6072 return split_char(self,list,substring->str[0],maxcount);
6073
6074 else if (substring->length == 0) {
6075 Py_DECREF(list);
6076 PyErr_SetString(PyExc_ValueError, "empty separator");
6077 return NULL;
6078 }
6079 else
6080 return split_substring(self,list,substring,maxcount);
6081}
6082
Tim Petersced69f82003-09-16 20:30:58 +00006083static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006084PyObject *rsplit(PyUnicodeObject *self,
6085 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006086 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006087{
6088 PyObject *list;
6089
6090 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006091 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006092
6093 list = PyList_New(0);
6094 if (!list)
6095 return NULL;
6096
6097 if (substring == NULL)
6098 return rsplit_whitespace(self,list,maxcount);
6099
6100 else if (substring->length == 1)
6101 return rsplit_char(self,list,substring->str[0],maxcount);
6102
6103 else if (substring->length == 0) {
6104 Py_DECREF(list);
6105 PyErr_SetString(PyExc_ValueError, "empty separator");
6106 return NULL;
6107 }
6108 else
6109 return rsplit_substring(self,list,substring,maxcount);
6110}
6111
6112static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113PyObject *replace(PyUnicodeObject *self,
6114 PyUnicodeObject *str1,
6115 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006116 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117{
6118 PyUnicodeObject *u;
6119
6120 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006121 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122
Thomas Wouters477c8d52006-05-27 19:21:47 +00006123 if (str1->length == str2->length) {
6124 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006125 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006126 if (str1->length == 1) {
6127 /* replace characters */
6128 Py_UNICODE u1, u2;
6129 if (!findchar(self->str, self->length, str1->str[0]))
6130 goto nothing;
6131 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6132 if (!u)
6133 return NULL;
6134 Py_UNICODE_COPY(u->str, self->str, self->length);
6135 u1 = str1->str[0];
6136 u2 = str2->str[0];
6137 for (i = 0; i < u->length; i++)
6138 if (u->str[i] == u1) {
6139 if (--maxcount < 0)
6140 break;
6141 u->str[i] = u2;
6142 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006144 i = fastsearch(
6145 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006147 if (i < 0)
6148 goto nothing;
6149 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6150 if (!u)
6151 return NULL;
6152 Py_UNICODE_COPY(u->str, self->str, self->length);
6153 while (i <= self->length - str1->length)
6154 if (Py_UNICODE_MATCH(self, i, str1)) {
6155 if (--maxcount < 0)
6156 break;
6157 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6158 i += str1->length;
6159 } else
6160 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006163
6164 Py_ssize_t n, i, j, e;
6165 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 Py_UNICODE *p;
6167
6168 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006169 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 if (n > maxcount)
6171 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006172 if (n == 0)
6173 goto nothing;
6174 /* new_size = self->length + n * (str2->length - str1->length)); */
6175 delta = (str2->length - str1->length);
6176 if (delta == 0) {
6177 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006179 product = n * (str2->length - str1->length);
6180 if ((product / (str2->length - str1->length)) != n) {
6181 PyErr_SetString(PyExc_OverflowError,
6182 "replace string is too long");
6183 return NULL;
6184 }
6185 new_size = self->length + product;
6186 if (new_size < 0) {
6187 PyErr_SetString(PyExc_OverflowError,
6188 "replace string is too long");
6189 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 }
6191 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006192 u = _PyUnicode_New(new_size);
6193 if (!u)
6194 return NULL;
6195 i = 0;
6196 p = u->str;
6197 e = self->length - str1->length;
6198 if (str1->length > 0) {
6199 while (n-- > 0) {
6200 /* look for next match */
6201 j = i;
6202 while (j <= e) {
6203 if (Py_UNICODE_MATCH(self, j, str1))
6204 break;
6205 j++;
6206 }
6207 if (j > i) {
6208 if (j > e)
6209 break;
6210 /* copy unchanged part [i:j] */
6211 Py_UNICODE_COPY(p, self->str+i, j-i);
6212 p += j - i;
6213 }
6214 /* copy substitution string */
6215 if (str2->length > 0) {
6216 Py_UNICODE_COPY(p, str2->str, str2->length);
6217 p += str2->length;
6218 }
6219 i = j + str1->length;
6220 }
6221 if (i < self->length)
6222 /* copy tail [i:] */
6223 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6224 } else {
6225 /* interleave */
6226 while (n > 0) {
6227 Py_UNICODE_COPY(p, str2->str, str2->length);
6228 p += str2->length;
6229 if (--n <= 0)
6230 break;
6231 *p++ = self->str[i++];
6232 }
6233 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006237
6238nothing:
6239 /* nothing to replace; return original string (when possible) */
6240 if (PyUnicode_CheckExact(self)) {
6241 Py_INCREF(self);
6242 return (PyObject *) self;
6243 }
6244 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245}
6246
6247/* --- Unicode Object Methods --------------------------------------------- */
6248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006249PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006250"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251\n\
6252Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006253characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254
6255static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006256unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 return fixup(self, fixtitle);
6259}
6260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006261PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006262"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263\n\
6264Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006265have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266
6267static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006268unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 return fixup(self, fixcapitalize);
6271}
6272
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self with split(),
   run fixup(..., fixcapitalize) on every piece in place, then join the
   pieces with PyUnicode_Join().  Returns NULL as soon as any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                               fixcapitalize);
        if (word == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
6310
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006311/* Argument converter. Coerces to a single unicode character */
6312
6313static int
6314convert_uc(PyObject *obj, void *addr)
6315{
6316 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6317 PyObject *uniobj;
6318 Py_UNICODE *unistr;
6319
6320 uniobj = PyUnicode_FromObject(obj);
6321 if (uniobj == NULL) {
6322 PyErr_SetString(PyExc_TypeError,
6323 "The fill character cannot be converted to Unicode");
6324 return 0;
6325 }
6326 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6327 PyErr_SetString(PyExc_TypeError,
6328 "The fill character must be exactly one character long");
6329 Py_DECREF(uniobj);
6330 return 0;
6331 }
6332 unistr = PyUnicode_AS_UNICODE(uniobj);
6333 *fillcharloc = unistr[0];
6334 Py_DECREF(uniobj);
6335 return 1;
6336}
6337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006338PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006339"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006341Return S centered in a Unicode string of length width. Padding is\n\
6342done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
6344static PyObject *
6345unicode_center(PyUnicodeObject *self, PyObject *args)
6346{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006347 Py_ssize_t marg, left;
6348 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006349 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350
Thomas Woutersde017742006-02-16 19:34:37 +00006351 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 return NULL;
6353
Tim Peters7a29bd52001-09-12 03:03:31 +00006354 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355 Py_INCREF(self);
6356 return (PyObject*) self;
6357 }
6358
6359 marg = width - self->length;
6360 left = marg / 2 + (marg & width & 1);
6361
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006362 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363}
6364
Marc-André Lemburge5034372000-08-08 08:04:29 +00006365#if 0
6366
6367/* This code should go into some future Unicode collation support
6368 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006369 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006370
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006371/* speedy UTF-16 code point order comparison */
6372/* gleaned from: */
6373/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6374
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006375static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006376{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006377 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006378 0, 0, 0, 0, 0, 0, 0, 0,
6379 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006380 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006381};
6382
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383static int
6384unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6385{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006386 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 Py_UNICODE *s1 = str1->str;
6389 Py_UNICODE *s2 = str2->str;
6390
6391 len1 = str1->length;
6392 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006393
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006395 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006396
6397 c1 = *s1++;
6398 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006399
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006400 if (c1 > (1<<11) * 26)
6401 c1 += utf16Fixup[c1>>11];
6402 if (c2 > (1<<11) * 26)
6403 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006404 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006405
6406 if (c1 != c2)
6407 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006408
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006409 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 }
6411
6412 return (len1 < len2) ? -1 : (len1 != len2);
6413}
6414
Marc-André Lemburge5034372000-08-08 08:04:29 +00006415#else
6416
6417static int
6418unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6419{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006420 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006421
6422 Py_UNICODE *s1 = str1->str;
6423 Py_UNICODE *s2 = str2->str;
6424
6425 len1 = str1->length;
6426 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006427
Marc-André Lemburge5034372000-08-08 08:04:29 +00006428 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006429 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006430
Fredrik Lundh45714e92001-06-26 16:39:36 +00006431 c1 = *s1++;
6432 c2 = *s2++;
6433
6434 if (c1 != c2)
6435 return (c1 < c2) ? -1 : 1;
6436
Marc-André Lemburge5034372000-08-08 08:04:29 +00006437 len1--; len2--;
6438 }
6439
6440 return (len1 < len2) ? -1 : (len1 != len2);
6441}
6442
6443#endif
6444
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445int PyUnicode_Compare(PyObject *left,
6446 PyObject *right)
6447{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006448 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6449 return unicode_compare((PyUnicodeObject *)left,
6450 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006451 PyErr_Format(PyExc_TypeError,
6452 "Can't compare %.100s and %.100s",
6453 left->ob_type->tp_name,
6454 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 return -1;
6456}
6457
Martin v. Löwis5b222132007-06-10 09:51:05 +00006458int
6459PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6460{
6461 int i;
6462 Py_UNICODE *id;
6463 assert(PyUnicode_Check(uni));
6464 id = PyUnicode_AS_UNICODE(uni);
6465 /* Compare Unicode string and source character set string */
6466 for (i = 0; id[i] && str[i]; i++)
6467 if (id[i] != str[i])
6468 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6469 if (id[i])
6470 return 1; /* uni is longer */
6471 if (str[i])
6472 return -1; /* str is longer */
6473 return 0;
6474}
6475
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006476PyObject *PyUnicode_RichCompare(PyObject *left,
6477 PyObject *right,
6478 int op)
6479{
6480 int result;
6481
6482 result = PyUnicode_Compare(left, right);
6483 if (result == -1 && PyErr_Occurred())
6484 goto onError;
6485
6486 /* Convert the return value to a Boolean */
6487 switch (op) {
6488 case Py_EQ:
6489 result = (result == 0);
6490 break;
6491 case Py_NE:
6492 result = (result != 0);
6493 break;
6494 case Py_LE:
6495 result = (result <= 0);
6496 break;
6497 case Py_GE:
6498 result = (result >= 0);
6499 break;
6500 case Py_LT:
6501 result = (result == -1);
6502 break;
6503 case Py_GT:
6504 result = (result == 1);
6505 break;
6506 }
6507 return PyBool_FromLong(result);
6508
6509 onError:
6510
6511 /* Standard case
6512
6513 Type errors mean that PyUnicode_FromObject() could not convert
6514 one of the arguments (usually the right hand side) to Unicode,
6515 ie. we can't handle the comparison request. However, it is
6516 possible that the other object knows a comparison method, which
6517 is why we return Py_NotImplemented to give the other object a
6518 chance.
6519
6520 */
6521 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6522 PyErr_Clear();
6523 Py_INCREF(Py_NotImplemented);
6524 return Py_NotImplemented;
6525 }
6526 if (op != Py_EQ && op != Py_NE)
6527 return NULL;
6528
6529 /* Equality comparison.
6530
6531 This is a special case: we silence any PyExc_UnicodeDecodeError
6532 and instead turn it into a PyErr_UnicodeWarning.
6533
6534 */
6535 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6536 return NULL;
6537 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006538 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6539 (op == Py_EQ) ?
6540 "Unicode equal comparison "
6541 "failed to convert both arguments to Unicode - "
6542 "interpreting them as being unequal"
6543 :
6544 "Unicode unequal comparison "
6545 "failed to convert both arguments to Unicode - "
6546 "interpreting them as being unequal",
6547 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006548 return NULL;
6549 result = (op == Py_NE);
6550 return PyBool_FromLong(result);
6551}
6552
Guido van Rossum403d68b2000-03-13 15:55:09 +00006553int PyUnicode_Contains(PyObject *container,
6554 PyObject *element)
6555{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006556 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006557 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006558
6559 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006560 sub = PyUnicode_FromObject(element);
6561 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006562 PyErr_Format(PyExc_TypeError,
6563 "'in <string>' requires string as left operand, not %s",
6564 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006565 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006566 }
6567
Thomas Wouters477c8d52006-05-27 19:21:47 +00006568 str = PyUnicode_FromObject(container);
6569 if (!str) {
6570 Py_DECREF(sub);
6571 return -1;
6572 }
6573
6574 result = stringlib_contains_obj(str, sub);
6575
6576 Py_DECREF(str);
6577 Py_DECREF(sub);
6578
Guido van Rossum403d68b2000-03-13 15:55:09 +00006579 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006580}
6581
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582/* Concat to string or Unicode object giving a new Unicode object. */
6583
6584PyObject *PyUnicode_Concat(PyObject *left,
6585 PyObject *right)
6586{
6587 PyUnicodeObject *u = NULL, *v = NULL, *w;
6588
6589 /* Coerce the two arguments */
6590 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6591 if (u == NULL)
6592 goto onError;
6593 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6594 if (v == NULL)
6595 goto onError;
6596
6597 /* Shortcuts */
6598 if (v == unicode_empty) {
6599 Py_DECREF(v);
6600 return (PyObject *)u;
6601 }
6602 if (u == unicode_empty) {
6603 Py_DECREF(u);
6604 return (PyObject *)v;
6605 }
6606
6607 /* Concat the two Unicode strings */
6608 w = _PyUnicode_New(u->length + v->length);
6609 if (w == NULL)
6610 goto onError;
6611 Py_UNICODE_COPY(w->str, u->str, u->length);
6612 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6613
6614 Py_DECREF(u);
6615 Py_DECREF(v);
6616 return (PyObject *)w;
6617
6618onError:
6619 Py_XDECREF(u);
6620 Py_XDECREF(v);
6621 return NULL;
6622}
6623
Walter Dörwald1ab83302007-05-18 17:15:44 +00006624void
6625PyUnicode_Append(PyObject **pleft, PyObject *right)
6626{
6627 PyObject *new;
6628 if (*pleft == NULL)
6629 return;
6630 if (right == NULL || !PyUnicode_Check(*pleft)) {
6631 Py_DECREF(*pleft);
6632 *pleft = NULL;
6633 return;
6634 }
6635 new = PyUnicode_Concat(*pleft, right);
6636 Py_DECREF(*pleft);
6637 *pleft = new;
6638}
6639
6640void
6641PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6642{
6643 PyUnicode_Append(pleft, right);
6644 Py_XDECREF(right);
6645}
6646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006647PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648"S.count(sub[, start[, end]]) -> int\n\
6649\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006650Return the number of non-overlapping occurrences of substring sub in\n\
6651Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006652interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653
6654static PyObject *
6655unicode_count(PyUnicodeObject *self, PyObject *args)
6656{
6657 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006658 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006659 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 PyObject *result;
6661
Guido van Rossumb8872e62000-05-09 14:14:27 +00006662 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6663 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 return NULL;
6665
6666 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006667 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 if (substring == NULL)
6669 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006670
Thomas Wouters477c8d52006-05-27 19:21:47 +00006671 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
Christian Heimes217cfd12007-12-02 14:31:20 +00006673 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006674 stringlib_count(self->str + start, end - start,
6675 substring->str, substring->length)
6676 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677
6678 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006679
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 return result;
6681}
6682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006683PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006684"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006686Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006687to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006688handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6690'xmlcharrefreplace' as well as any other name registered with\n\
6691codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
6693static PyObject *
6694unicode_encode(PyUnicodeObject *self, PyObject *args)
6695{
6696 char *encoding = NULL;
6697 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006698 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006699
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6701 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006702 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006703 if (v == NULL)
6704 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006705 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006706 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006707 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006708 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006709 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006710 Py_DECREF(v);
6711 return NULL;
6712 }
6713 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006714
6715 onError:
6716 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006717}
6718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006719PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006720"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721\n\
6722Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006723If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
6725static PyObject*
6726unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6727{
6728 Py_UNICODE *e;
6729 Py_UNICODE *p;
6730 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006731 Py_UNICODE *qe;
6732 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 PyUnicodeObject *u;
6734 int tabsize = 8;
6735
6736 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6737 return NULL;
6738
Thomas Wouters7e474022000-07-16 12:04:32 +00006739 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006740 i = 0; /* chars up to and including most recent \n or \r */
6741 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6742 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 for (p = self->str; p < e; p++)
6744 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006745 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006746 incr = tabsize - (j % tabsize); /* cannot overflow */
6747 if (j > PY_SSIZE_T_MAX - incr)
6748 goto overflow1;
6749 j += incr;
6750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 }
6752 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006753 if (j > PY_SSIZE_T_MAX - 1)
6754 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 j++;
6756 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006757 if (i > PY_SSIZE_T_MAX - j)
6758 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006760 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 }
6762 }
6763
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006764 if (i > PY_SSIZE_T_MAX - j)
6765 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006766
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 /* Second pass: create output string and fill it */
6768 u = _PyUnicode_New(i + j);
6769 if (!u)
6770 return NULL;
6771
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006772 j = 0; /* same as in first pass */
6773 q = u->str; /* next output char */
6774 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775
6776 for (p = self->str; p < e; p++)
6777 if (*p == '\t') {
6778 if (tabsize > 0) {
6779 i = tabsize - (j % tabsize);
6780 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006781 while (i--) {
6782 if (q >= qe)
6783 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 }
6787 }
6788 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006789 if (q >= qe)
6790 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006792 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 if (*p == '\n' || *p == '\r')
6794 j = 0;
6795 }
6796
6797 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006798
6799 overflow2:
6800 Py_DECREF(u);
6801 overflow1:
6802 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6803 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804}
6805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006806PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006807"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808\n\
6809Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006810such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811arguments start and end are interpreted as in slice notation.\n\
6812\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006813Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
6815static PyObject *
6816unicode_find(PyUnicodeObject *self, PyObject *args)
6817{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006818 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006819 Py_ssize_t start;
6820 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006821 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822
Christian Heimes9cd17752007-11-18 19:35:23 +00006823 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825
Thomas Wouters477c8d52006-05-27 19:21:47 +00006826 result = stringlib_find_slice(
6827 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6828 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6829 start, end
6830 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831
6832 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006833
Christian Heimes217cfd12007-12-02 14:31:20 +00006834 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835}
6836
6837static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006838unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839{
6840 if (index < 0 || index >= self->length) {
6841 PyErr_SetString(PyExc_IndexError, "string index out of range");
6842 return NULL;
6843 }
6844
6845 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6846}
6847
Guido van Rossumc2504932007-09-18 19:42:40 +00006848/* Believe it or not, this produces the same value for ASCII strings
6849 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006851unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852{
Guido van Rossumc2504932007-09-18 19:42:40 +00006853 Py_ssize_t len;
6854 Py_UNICODE *p;
6855 long x;
6856
6857 if (self->hash != -1)
6858 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006859 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006860 p = self->str;
6861 x = *p << 7;
6862 while (--len >= 0)
6863 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006864 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006865 if (x == -1)
6866 x = -2;
6867 self->hash = x;
6868 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869}
6870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006871PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006872"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006874Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875
6876static PyObject *
6877unicode_index(PyUnicodeObject *self, PyObject *args)
6878{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006879 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006880 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006881 Py_ssize_t start;
6882 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883
Christian Heimes9cd17752007-11-18 19:35:23 +00006884 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886
Thomas Wouters477c8d52006-05-27 19:21:47 +00006887 result = stringlib_find_slice(
6888 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6889 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6890 start, end
6891 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892
6893 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006894
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 if (result < 0) {
6896 PyErr_SetString(PyExc_ValueError, "substring not found");
6897 return NULL;
6898 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006899
Christian Heimes217cfd12007-12-02 14:31:20 +00006900 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901}
6902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006903PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006904"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006906Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006907at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
6909static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006910unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911{
6912 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6913 register const Py_UNICODE *e;
6914 int cased;
6915
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 /* Shortcut for single character strings */
6917 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006918 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006920 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006921 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006923
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 e = p + PyUnicode_GET_SIZE(self);
6925 cased = 0;
6926 for (; p < e; p++) {
6927 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006928
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006930 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 else if (!cased && Py_UNICODE_ISLOWER(ch))
6932 cased = 1;
6933 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935}
6936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006937PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006940Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006941at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942
6943static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006944unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945{
6946 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6947 register const Py_UNICODE *e;
6948 int cased;
6949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* Shortcut for single character strings */
6951 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006952 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006954 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006955 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006957
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 e = p + PyUnicode_GET_SIZE(self);
6959 cased = 0;
6960 for (; p < e; p++) {
6961 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006962
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006964 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 else if (!cased && Py_UNICODE_ISUPPER(ch))
6966 cased = 1;
6967 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006968 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969}
6970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006971PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006972"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006974Return True if S is a titlecased string and there is at least one\n\
6975character in S, i.e. upper- and titlecase characters may only\n\
6976follow uncased characters and lowercase characters only cased ones.\n\
6977Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978
6979static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006980unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981{
6982 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6983 register const Py_UNICODE *e;
6984 int cased, previous_is_cased;
6985
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 /* Shortcut for single character strings */
6987 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006988 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6989 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006991 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006992 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006993 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006994
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 e = p + PyUnicode_GET_SIZE(self);
6996 cased = 0;
6997 previous_is_cased = 0;
6998 for (; p < e; p++) {
6999 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007000
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7002 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007003 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 previous_is_cased = 1;
7005 cased = 1;
7006 }
7007 else if (Py_UNICODE_ISLOWER(ch)) {
7008 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007009 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 previous_is_cased = 1;
7011 cased = 1;
7012 }
7013 else
7014 previous_is_cased = 0;
7015 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007016 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017}
7018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007019PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007020"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007022Return True if all characters in S are whitespace\n\
7023and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024
7025static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007026unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
7028 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7029 register const Py_UNICODE *e;
7030
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 /* Shortcut for single character strings */
7032 if (PyUnicode_GET_SIZE(self) == 1 &&
7033 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007034 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007036 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007037 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007038 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007039
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 e = p + PyUnicode_GET_SIZE(self);
7041 for (; p < e; p++) {
7042 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007043 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007045 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046}
7047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007048PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007049"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007050\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007051Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007052and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007053
7054static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007055unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007056{
7057 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7058 register const Py_UNICODE *e;
7059
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007060 /* Shortcut for single character strings */
7061 if (PyUnicode_GET_SIZE(self) == 1 &&
7062 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007063 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007064
7065 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007066 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007067 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007068
7069 e = p + PyUnicode_GET_SIZE(self);
7070 for (; p < e; p++) {
7071 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007072 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007073 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007074 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007075}
7076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007077PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007078"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007079\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007080Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007081and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007082
7083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007084unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007085{
7086 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7087 register const Py_UNICODE *e;
7088
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007089 /* Shortcut for single character strings */
7090 if (PyUnicode_GET_SIZE(self) == 1 &&
7091 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007092 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007093
7094 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007095 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007096 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007097
7098 e = p + PyUnicode_GET_SIZE(self);
7099 for (; p < e; p++) {
7100 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007101 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007102 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007103 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007104}
7105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007106PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007107"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007109Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007110False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111
7112static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007113unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114{
7115 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7116 register const Py_UNICODE *e;
7117
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 /* Shortcut for single character strings */
7119 if (PyUnicode_GET_SIZE(self) == 1 &&
7120 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007121 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007123 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007124 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007125 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007126
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127 e = p + PyUnicode_GET_SIZE(self);
7128 for (; p < e; p++) {
7129 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007130 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007132 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133}
7134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007135PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007136"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007138Return True if all characters in S are digits\n\
7139and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140
7141static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007142unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143{
7144 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7145 register const Py_UNICODE *e;
7146
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 /* Shortcut for single character strings */
7148 if (PyUnicode_GET_SIZE(self) == 1 &&
7149 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007150 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007152 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007153 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007154 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007155
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 e = p + PyUnicode_GET_SIZE(self);
7157 for (; p < e; p++) {
7158 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007159 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007161 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162}
7163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007164PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007165"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007167Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007168False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169
7170static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007171unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172{
7173 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7174 register const Py_UNICODE *e;
7175
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 /* Shortcut for single character strings */
7177 if (PyUnicode_GET_SIZE(self) == 1 &&
7178 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007179 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007181 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007182 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007183 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007184
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 e = p + PyUnicode_GET_SIZE(self);
7186 for (; p < e; p++) {
7187 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007188 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007190 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191}
7192
Martin v. Löwis47383402007-08-15 07:32:56 +00007193int
7194PyUnicode_IsIdentifier(PyObject *self)
7195{
7196 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7197 register const Py_UNICODE *e;
7198
7199 /* Special case for empty strings */
7200 if (PyUnicode_GET_SIZE(self) == 0)
7201 return 0;
7202
7203 /* PEP 3131 says that the first character must be in
7204 XID_Start and subsequent characters in XID_Continue,
7205 and for the ASCII range, the 2.x rules apply (i.e
7206 start with letters and underscore, continue with
7207 letters, digits, underscore). However, given the current
7208 definition of XID_Start and XID_Continue, it is sufficient
7209 to check just for these, except that _ must be allowed
7210 as starting an identifier. */
7211 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7212 return 0;
7213
7214 e = p + PyUnicode_GET_SIZE(self);
7215 for (p++; p < e; p++) {
7216 if (!_PyUnicode_IsXidContinue(*p))
7217 return 0;
7218 }
7219 return 1;
7220}
7221
7222PyDoc_STRVAR(isidentifier__doc__,
7223"S.isidentifier() -> bool\n\
7224\n\
7225Return True if S is a valid identifier according\n\
7226to the language definition.");
7227
7228static PyObject*
7229unicode_isidentifier(PyObject *self)
7230{
7231 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7232}
7233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007234PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007235"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236\n\
7237Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007238sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239
7240static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007241unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007243 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244}
7245
Martin v. Löwis18e16552006-02-15 17:27:45 +00007246static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247unicode_length(PyUnicodeObject *self)
7248{
7249 return self->length;
7250}
7251
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007252PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007253"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254\n\
7255Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007256done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257
7258static PyObject *
7259unicode_ljust(PyUnicodeObject *self, PyObject *args)
7260{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007261 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007262 Py_UNICODE fillchar = ' ';
7263
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007264 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265 return NULL;
7266
Tim Peters7a29bd52001-09-12 03:03:31 +00007267 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 Py_INCREF(self);
7269 return (PyObject*) self;
7270 }
7271
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007272 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273}
7274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007275PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007276"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007278Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279
7280static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007281unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 return fixup(self, fixlower);
7284}
7285
/* Strip modes; the values double as indices into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7294
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007295/* externally visible for str.strip(unicode) */
7296PyObject *
7297_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7298{
7299 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007300 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007301 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007302 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7303 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007304
Thomas Wouters477c8d52006-05-27 19:21:47 +00007305 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7306
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007307 i = 0;
7308 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007309 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7310 i++;
7311 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007312 }
7313
7314 j = len;
7315 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007316 do {
7317 j--;
7318 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7319 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007320 }
7321
7322 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007323 Py_INCREF(self);
7324 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007325 }
7326 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007327 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007328}
7329
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330
7331static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007332do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007334 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007335 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007336
7337 i = 0;
7338 if (striptype != RIGHTSTRIP) {
7339 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7340 i++;
7341 }
7342 }
7343
7344 j = len;
7345 if (striptype != LEFTSTRIP) {
7346 do {
7347 j--;
7348 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7349 j++;
7350 }
7351
7352 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7353 Py_INCREF(self);
7354 return (PyObject*)self;
7355 }
7356 else
7357 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358}
7359
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007360
7361static PyObject *
7362do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7363{
7364 PyObject *sep = NULL;
7365
7366 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7367 return NULL;
7368
7369 if (sep != NULL && sep != Py_None) {
7370 if (PyUnicode_Check(sep))
7371 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007372 else {
7373 PyErr_Format(PyExc_TypeError,
7374 "%s arg must be None, unicode or str",
7375 STRIPNAME(striptype));
7376 return NULL;
7377 }
7378 }
7379
7380 return do_strip(self, striptype);
7381}
7382
7383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007384PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007385"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007386\n\
7387Return a copy of the string S with leading and trailing\n\
7388whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007389If chars is given and not None, remove characters in chars instead.\n\
7390If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007391
7392static PyObject *
7393unicode_strip(PyUnicodeObject *self, PyObject *args)
7394{
7395 if (PyTuple_GET_SIZE(args) == 0)
7396 return do_strip(self, BOTHSTRIP); /* Common case */
7397 else
7398 return do_argstrip(self, BOTHSTRIP, args);
7399}
7400
7401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007402PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007403"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007404\n\
7405Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007406If chars is given and not None, remove characters in chars instead.\n\
7407If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007408
7409static PyObject *
7410unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7411{
7412 if (PyTuple_GET_SIZE(args) == 0)
7413 return do_strip(self, LEFTSTRIP); /* Common case */
7414 else
7415 return do_argstrip(self, LEFTSTRIP, args);
7416}
7417
7418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007419PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007420"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007421\n\
7422Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007423If chars is given and not None, remove characters in chars instead.\n\
7424If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007425
7426static PyObject *
7427unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7428{
7429 if (PyTuple_GET_SIZE(args) == 0)
7430 return do_strip(self, RIGHTSTRIP); /* Common case */
7431 else
7432 return do_argstrip(self, RIGHTSTRIP, args);
7433}
7434
7435
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007437unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438{
7439 PyUnicodeObject *u;
7440 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007441 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007442 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443
7444 if (len < 0)
7445 len = 0;
7446
Tim Peters7a29bd52001-09-12 03:03:31 +00007447 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 /* no repeat, return original string */
7449 Py_INCREF(str);
7450 return (PyObject*) str;
7451 }
Tim Peters8f422462000-09-09 06:13:41 +00007452
7453 /* ensure # of chars needed doesn't overflow int and # of bytes
7454 * needed doesn't overflow size_t
7455 */
7456 nchars = len * str->length;
7457 if (len && nchars / len != str->length) {
7458 PyErr_SetString(PyExc_OverflowError,
7459 "repeated string is too long");
7460 return NULL;
7461 }
7462 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7463 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7464 PyErr_SetString(PyExc_OverflowError,
7465 "repeated string is too long");
7466 return NULL;
7467 }
7468 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 if (!u)
7470 return NULL;
7471
7472 p = u->str;
7473
Thomas Wouters477c8d52006-05-27 19:21:47 +00007474 if (str->length == 1 && len > 0) {
7475 Py_UNICODE_FILL(p, str->str[0], len);
7476 } else {
7477 Py_ssize_t done = 0; /* number of characters copied this far */
7478 if (done < nchars) {
7479 Py_UNICODE_COPY(p, str->str, str->length);
7480 done = str->length;
7481 }
7482 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007483 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007484 Py_UNICODE_COPY(p+done, p, n);
7485 done += n;
7486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 }
7488
7489 return (PyObject*) u;
7490}
7491
7492PyObject *PyUnicode_Replace(PyObject *obj,
7493 PyObject *subobj,
7494 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007495 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496{
7497 PyObject *self;
7498 PyObject *str1;
7499 PyObject *str2;
7500 PyObject *result;
7501
7502 self = PyUnicode_FromObject(obj);
7503 if (self == NULL)
7504 return NULL;
7505 str1 = PyUnicode_FromObject(subobj);
7506 if (str1 == NULL) {
7507 Py_DECREF(self);
7508 return NULL;
7509 }
7510 str2 = PyUnicode_FromObject(replobj);
7511 if (str2 == NULL) {
7512 Py_DECREF(self);
7513 Py_DECREF(str1);
7514 return NULL;
7515 }
Tim Petersced69f82003-09-16 20:30:58 +00007516 result = replace((PyUnicodeObject *)self,
7517 (PyUnicodeObject *)str1,
7518 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519 maxcount);
7520 Py_DECREF(self);
7521 Py_DECREF(str1);
7522 Py_DECREF(str2);
7523 return result;
7524}
7525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007526PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007527"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528\n\
7529Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007530old replaced by new. If the optional argument count is\n\
7531given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532
7533static PyObject*
7534unicode_replace(PyUnicodeObject *self, PyObject *args)
7535{
7536 PyUnicodeObject *str1;
7537 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007538 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 PyObject *result;
7540
Martin v. Löwis18e16552006-02-15 17:27:45 +00007541 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 return NULL;
7543 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7544 if (str1 == NULL)
7545 return NULL;
7546 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007547 if (str2 == NULL) {
7548 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551
7552 result = replace(self, str1, str2, maxcount);
7553
7554 Py_DECREF(str1);
7555 Py_DECREF(str2);
7556 return result;
7557}
7558
7559static
7560PyObject *unicode_repr(PyObject *unicode)
7561{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007562 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007563 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007564 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7565 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7566
7567 /* XXX(nnorwitz): rather than over-allocating, it would be
7568 better to choose a different scheme. Perhaps scan the
7569 first N-chars of the string and allocate based on that size.
7570 */
7571 /* Initial allocation is based on the longest-possible unichr
7572 escape.
7573
7574 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7575 unichr, so in this case it's the longest unichr escape. In
7576 narrow (UTF-16) builds this is five chars per source unichr
7577 since there are two unichrs in the surrogate pair, so in narrow
7578 (UTF-16) builds it's not the longest unichr escape.
7579
7580 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7581 so in the narrow (UTF-16) build case it's the longest unichr
7582 escape.
7583 */
7584
Walter Dörwald1ab83302007-05-18 17:15:44 +00007585 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007586 2 /* quotes */
7587#ifdef Py_UNICODE_WIDE
7588 + 10*size
7589#else
7590 + 6*size
7591#endif
7592 + 1);
7593 if (repr == NULL)
7594 return NULL;
7595
Walter Dörwald1ab83302007-05-18 17:15:44 +00007596 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007597
7598 /* Add quote */
7599 *p++ = (findchar(s, size, '\'') &&
7600 !findchar(s, size, '"')) ? '"' : '\'';
7601 while (size-- > 0) {
7602 Py_UNICODE ch = *s++;
7603
7604 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007605 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007606 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007607 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007608 continue;
7609 }
7610
Georg Brandla26f8ca2008-06-04 13:01:30 +00007611#ifdef Py_UNICODE_WIDE
7612 /* Map 21-bit characters to '\U00xxxxxx' */
7613 else if (ch >= 0x10000) {
7614 *p++ = '\\';
7615 *p++ = 'U';
7616 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7617 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7618 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7619 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7620 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7621 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7622 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7623 *p++ = hexdigits[ch & 0x0000000F];
7624 continue;
7625 }
7626#else
7627 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7628 else if (ch >= 0xD800 && ch < 0xDC00) {
7629 Py_UNICODE ch2;
7630 Py_UCS4 ucs;
7631
7632 ch2 = *s++;
7633 size--;
7634 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7635 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7636 *p++ = '\\';
7637 *p++ = 'U';
7638 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7639 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7640 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7641 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7642 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7643 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7644 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7645 *p++ = hexdigits[ucs & 0x0000000F];
7646 continue;
7647 }
7648 /* Fall through: isolated surrogates are copied as-is */
7649 s--;
7650 size++;
7651 }
7652#endif
7653
7654 /* Map 16-bit characters to '\uxxxx' */
7655 if (ch >= 256) {
7656 *p++ = '\\';
7657 *p++ = 'u';
7658 *p++ = hexdigits[(ch >> 12) & 0x000F];
7659 *p++ = hexdigits[(ch >> 8) & 0x000F];
7660 *p++ = hexdigits[(ch >> 4) & 0x000F];
7661 *p++ = hexdigits[ch & 0x000F];
7662 }
7663
7664 /* Map special whitespace to '\t', \n', '\r' */
7665 else if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007666 *p++ = '\\';
7667 *p++ = 't';
7668 }
7669 else if (ch == '\n') {
7670 *p++ = '\\';
7671 *p++ = 'n';
7672 }
7673 else if (ch == '\r') {
7674 *p++ = '\\';
7675 *p++ = 'r';
7676 }
7677
7678 /* Map non-printable US ASCII to '\xhh' */
Georg Brandla26f8ca2008-06-04 13:01:30 +00007679 else if (ch < ' ' || ch >= 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007680 *p++ = '\\';
7681 *p++ = 'x';
7682 *p++ = hexdigits[(ch >> 4) & 0x000F];
7683 *p++ = hexdigits[ch & 0x000F];
7684 }
7685
Georg Brandla26f8ca2008-06-04 13:01:30 +00007686 /* Copy everything else as-is */
7687 else
7688 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007689 }
7690 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007691 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007692
7693 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007694 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007695 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696}
7697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007698PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007699"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700\n\
7701Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007702such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703arguments start and end are interpreted as in slice notation.\n\
7704\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007705Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706
7707static PyObject *
7708unicode_rfind(PyUnicodeObject *self, PyObject *args)
7709{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007710 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007711 Py_ssize_t start;
7712 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007713 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714
Christian Heimes9cd17752007-11-18 19:35:23 +00007715 if (!_ParseTupleFinds(args, &substring, &start, &end))
7716 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717
Thomas Wouters477c8d52006-05-27 19:21:47 +00007718 result = stringlib_rfind_slice(
7719 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7720 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7721 start, end
7722 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723
7724 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007725
Christian Heimes217cfd12007-12-02 14:31:20 +00007726 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727}
7728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007729PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007730"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007732Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733
7734static PyObject *
7735unicode_rindex(PyUnicodeObject *self, PyObject *args)
7736{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007737 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007738 Py_ssize_t start;
7739 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007740 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741
Christian Heimes9cd17752007-11-18 19:35:23 +00007742 if (!_ParseTupleFinds(args, &substring, &start, &end))
7743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744
Thomas Wouters477c8d52006-05-27 19:21:47 +00007745 result = stringlib_rfind_slice(
7746 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7747 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7748 start, end
7749 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750
7751 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007752
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 if (result < 0) {
7754 PyErr_SetString(PyExc_ValueError, "substring not found");
7755 return NULL;
7756 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007757 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758}
7759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007760PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007761"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762\n\
7763Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007764done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
7766static PyObject *
7767unicode_rjust(PyUnicodeObject *self, PyObject *args)
7768{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007769 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007770 Py_UNICODE fillchar = ' ';
7771
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007772 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 return NULL;
7774
Tim Peters7a29bd52001-09-12 03:03:31 +00007775 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776 Py_INCREF(self);
7777 return (PyObject*) self;
7778 }
7779
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007780 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781}
7782
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783PyObject *PyUnicode_Split(PyObject *s,
7784 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007785 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786{
7787 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007788
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 s = PyUnicode_FromObject(s);
7790 if (s == NULL)
7791 return NULL;
7792 if (sep != NULL) {
7793 sep = PyUnicode_FromObject(sep);
7794 if (sep == NULL) {
7795 Py_DECREF(s);
7796 return NULL;
7797 }
7798 }
7799
7800 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7801
7802 Py_DECREF(s);
7803 Py_XDECREF(sep);
7804 return result;
7805}
7806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007807PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007808"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809\n\
7810Return a list of the words in S, using sep as the\n\
7811delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007812splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007813whitespace string is a separator and empty strings are\n\
7814removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815
7816static PyObject*
7817unicode_split(PyUnicodeObject *self, PyObject *args)
7818{
7819 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007820 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821
Martin v. Löwis18e16552006-02-15 17:27:45 +00007822 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 return NULL;
7824
7825 if (substring == Py_None)
7826 return split(self, NULL, maxcount);
7827 else if (PyUnicode_Check(substring))
7828 return split(self, (PyUnicodeObject *)substring, maxcount);
7829 else
7830 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7831}
7832
Thomas Wouters477c8d52006-05-27 19:21:47 +00007833PyObject *
7834PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7835{
7836 PyObject* str_obj;
7837 PyObject* sep_obj;
7838 PyObject* out;
7839
7840 str_obj = PyUnicode_FromObject(str_in);
7841 if (!str_obj)
7842 return NULL;
7843 sep_obj = PyUnicode_FromObject(sep_in);
7844 if (!sep_obj) {
7845 Py_DECREF(str_obj);
7846 return NULL;
7847 }
7848
7849 out = stringlib_partition(
7850 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7851 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7852 );
7853
7854 Py_DECREF(sep_obj);
7855 Py_DECREF(str_obj);
7856
7857 return out;
7858}
7859
7860
7861PyObject *
7862PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7863{
7864 PyObject* str_obj;
7865 PyObject* sep_obj;
7866 PyObject* out;
7867
7868 str_obj = PyUnicode_FromObject(str_in);
7869 if (!str_obj)
7870 return NULL;
7871 sep_obj = PyUnicode_FromObject(sep_in);
7872 if (!sep_obj) {
7873 Py_DECREF(str_obj);
7874 return NULL;
7875 }
7876
7877 out = stringlib_rpartition(
7878 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7879 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7880 );
7881
7882 Py_DECREF(sep_obj);
7883 Py_DECREF(str_obj);
7884
7885 return out;
7886}
7887
7888PyDoc_STRVAR(partition__doc__,
7889"S.partition(sep) -> (head, sep, tail)\n\
7890\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007891Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007892the separator itself, and the part after it. If the separator is not\n\
7893found, returns S and two empty strings.");
7894
7895static PyObject*
7896unicode_partition(PyUnicodeObject *self, PyObject *separator)
7897{
7898 return PyUnicode_Partition((PyObject *)self, separator);
7899}
7900
7901PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007902"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007903\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007904Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007905the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007906separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007907
7908static PyObject*
7909unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7910{
7911 return PyUnicode_RPartition((PyObject *)self, separator);
7912}
7913
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007914PyObject *PyUnicode_RSplit(PyObject *s,
7915 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007916 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007917{
7918 PyObject *result;
7919
7920 s = PyUnicode_FromObject(s);
7921 if (s == NULL)
7922 return NULL;
7923 if (sep != NULL) {
7924 sep = PyUnicode_FromObject(sep);
7925 if (sep == NULL) {
7926 Py_DECREF(s);
7927 return NULL;
7928 }
7929 }
7930
7931 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7932
7933 Py_DECREF(s);
7934 Py_XDECREF(sep);
7935 return result;
7936}
7937
7938PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007939"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007940\n\
7941Return a list of the words in S, using sep as the\n\
7942delimiter string, starting at the end of the string and\n\
7943working to the front. If maxsplit is given, at most maxsplit\n\
7944splits are done. If sep is not specified, any whitespace string\n\
7945is a separator.");
7946
7947static PyObject*
7948unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7949{
7950 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007951 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007952
Martin v. Löwis18e16552006-02-15 17:27:45 +00007953 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007954 return NULL;
7955
7956 if (substring == Py_None)
7957 return rsplit(self, NULL, maxcount);
7958 else if (PyUnicode_Check(substring))
7959 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7960 else
7961 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7962}
7963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007964PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007965"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966\n\
7967Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007968Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007969is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970
7971static PyObject*
7972unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7973{
Guido van Rossum86662912000-04-11 15:38:46 +00007974 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975
Guido van Rossum86662912000-04-11 15:38:46 +00007976 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 return NULL;
7978
Guido van Rossum86662912000-04-11 15:38:46 +00007979 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980}
7981
7982static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007983PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984{
Walter Dörwald346737f2007-05-31 10:44:43 +00007985 if (PyUnicode_CheckExact(self)) {
7986 Py_INCREF(self);
7987 return self;
7988 } else
7989 /* Subtype -- return genuine unicode string with the same value. */
7990 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7991 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992}
7993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007994PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007995"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996\n\
7997Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007998and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999
8000static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008001unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 return fixup(self, fixswapcase);
8004}
8005
Georg Brandlceee0772007-11-27 23:48:05 +00008006PyDoc_STRVAR(maketrans__doc__,
8007"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8008\n\
8009Return a translation table usable for str.translate().\n\
8010If there is only one argument, it must be a dictionary mapping Unicode\n\
8011ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8012Character keys will then be converted to ordinals.\n\
8013If there are two arguments, they must be strings of equal length, and\n\
8014in the resulting dictionary, each character in x will be mapped to the\n\
8015character at the same position in y. If there is a third argument, it\n\
8016must be a string, whose characters will be mapped to None in the result.");
8017
8018static PyObject*
8019unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8020{
8021 PyObject *x, *y = NULL, *z = NULL;
8022 PyObject *new = NULL, *key, *value;
8023 Py_ssize_t i = 0;
8024 int res;
8025
8026 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8027 return NULL;
8028 new = PyDict_New();
8029 if (!new)
8030 return NULL;
8031 if (y != NULL) {
8032 /* x must be a string too, of equal length */
8033 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8034 if (!PyUnicode_Check(x)) {
8035 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8036 "be a string if there is a second argument");
8037 goto err;
8038 }
8039 if (PyUnicode_GET_SIZE(x) != ylen) {
8040 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8041 "arguments must have equal length");
8042 goto err;
8043 }
8044 /* create entries for translating chars in x to those in y */
8045 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008046 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8047 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008048 if (!key || !value)
8049 goto err;
8050 res = PyDict_SetItem(new, key, value);
8051 Py_DECREF(key);
8052 Py_DECREF(value);
8053 if (res < 0)
8054 goto err;
8055 }
8056 /* create entries for deleting chars in z */
8057 if (z != NULL) {
8058 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008059 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008060 if (!key)
8061 goto err;
8062 res = PyDict_SetItem(new, key, Py_None);
8063 Py_DECREF(key);
8064 if (res < 0)
8065 goto err;
8066 }
8067 }
8068 } else {
8069 /* x must be a dict */
8070 if (!PyDict_Check(x)) {
8071 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8072 "to maketrans it must be a dict");
8073 goto err;
8074 }
8075 /* copy entries into the new dict, converting string keys to int keys */
8076 while (PyDict_Next(x, &i, &key, &value)) {
8077 if (PyUnicode_Check(key)) {
8078 /* convert string keys to integer keys */
8079 PyObject *newkey;
8080 if (PyUnicode_GET_SIZE(key) != 1) {
8081 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8082 "table must be of length 1");
8083 goto err;
8084 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008085 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008086 if (!newkey)
8087 goto err;
8088 res = PyDict_SetItem(new, newkey, value);
8089 Py_DECREF(newkey);
8090 if (res < 0)
8091 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008092 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008093 /* just keep integer keys */
8094 if (PyDict_SetItem(new, key, value) < 0)
8095 goto err;
8096 } else {
8097 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8098 "be strings or integers");
8099 goto err;
8100 }
8101 }
8102 }
8103 return new;
8104 err:
8105 Py_DECREF(new);
8106 return NULL;
8107}
8108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008109PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008110"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111\n\
8112Return a copy of the string S, where all characters have been mapped\n\
8113through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008114Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
8115Unmapped characters are left untouched. Characters mapped to None\n\
8116are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117
8118static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008119unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120{
Georg Brandlceee0772007-11-27 23:48:05 +00008121 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122}
8123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008124PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008125"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008127Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128
8129static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008130unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 return fixup(self, fixupper);
8133}
8134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008135PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008136"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137\n\
8138Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008139of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140
8141static PyObject *
8142unicode_zfill(PyUnicodeObject *self, PyObject *args)
8143{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008144 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 PyUnicodeObject *u;
8146
Martin v. Löwis18e16552006-02-15 17:27:45 +00008147 Py_ssize_t width;
8148 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 return NULL;
8150
8151 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008152 if (PyUnicode_CheckExact(self)) {
8153 Py_INCREF(self);
8154 return (PyObject*) self;
8155 }
8156 else
8157 return PyUnicode_FromUnicode(
8158 PyUnicode_AS_UNICODE(self),
8159 PyUnicode_GET_SIZE(self)
8160 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 }
8162
8163 fill = width - self->length;
8164
8165 u = pad(self, fill, 0, '0');
8166
Walter Dörwald068325e2002-04-15 13:36:47 +00008167 if (u == NULL)
8168 return NULL;
8169
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170 if (u->str[fill] == '+' || u->str[fill] == '-') {
8171 /* move sign to beginning of string */
8172 u->str[0] = u->str[fill];
8173 u->str[fill] = '0';
8174 }
8175
8176 return (PyObject*) u;
8177}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178
#if 0
/* Compiled out: returns the value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long free_count = numfree;

    return PyLong_FromLong(free_count);
}
#endif
8186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008187PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008188"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008190Return True if S starts with the specified prefix, False otherwise.\n\
8191With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008192With optional end, stop comparing S at that position.\n\
8193prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194
8195static PyObject *
8196unicode_startswith(PyUnicodeObject *self,
8197 PyObject *args)
8198{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008199 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008201 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008202 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008203 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008205 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008206 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008208 if (PyTuple_Check(subobj)) {
8209 Py_ssize_t i;
8210 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8211 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8212 PyTuple_GET_ITEM(subobj, i));
8213 if (substring == NULL)
8214 return NULL;
8215 result = tailmatch(self, substring, start, end, -1);
8216 Py_DECREF(substring);
8217 if (result) {
8218 Py_RETURN_TRUE;
8219 }
8220 }
8221 /* nothing matched */
8222 Py_RETURN_FALSE;
8223 }
8224 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008226 return NULL;
8227 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008229 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230}
8231
8232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008233PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008234"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008236Return True if S ends with the specified suffix, False otherwise.\n\
8237With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008238With optional end, stop comparing S at that position.\n\
8239suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240
8241static PyObject *
8242unicode_endswith(PyUnicodeObject *self,
8243 PyObject *args)
8244{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008245 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008247 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008248 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008249 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008251 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8252 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008254 if (PyTuple_Check(subobj)) {
8255 Py_ssize_t i;
8256 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8257 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8258 PyTuple_GET_ITEM(subobj, i));
8259 if (substring == NULL)
8260 return NULL;
8261 result = tailmatch(self, substring, start, end, +1);
8262 Py_DECREF(substring);
8263 if (result) {
8264 Py_RETURN_TRUE;
8265 }
8266 }
8267 Py_RETURN_FALSE;
8268 }
8269 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008273 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008275 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276}
8277
Eric Smith8c663262007-08-25 02:26:07 +00008278#include "stringlib/string_format.h"
8279
8280PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008281"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008282\n\
8283");
8284
Eric Smith4a7d76d2008-05-30 18:10:19 +00008285static PyObject *
8286unicode__format__(PyObject* self, PyObject* args)
8287{
8288 PyObject *format_spec;
8289
8290 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8291 return NULL;
8292
8293 return _PyUnicode_FormatAdvanced(self,
8294 PyUnicode_AS_UNICODE(format_spec),
8295 PyUnicode_GET_SIZE(format_spec));
8296}
8297
Eric Smith8c663262007-08-25 02:26:07 +00008298PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008299"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008300\n\
8301");
8302
8303static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008304unicode__sizeof__(PyUnicodeObject *v)
8305{
8306 PyObject *res = NULL, *defsize = NULL;
8307
8308 res = PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8309 sizeof(Py_UNICODE) * (v->length + 1));
8310 if (v->defenc) {
8311 defsize = PyObject_CallMethod(v->defenc, "__sizeof__", NULL);
8312 if (defsize == NULL) {
8313 Py_DECREF(res);
8314 return NULL;
8315 }
8316 res = PyNumber_Add(res, defsize);
8317 Py_DECREF(defsize);
8318 }
8319 return res;
8320}
8321
8322PyDoc_STRVAR(sizeof__doc__,
8323"S.__sizeof__() -> size of S in memory, in bytes");
8324
8325static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008326unicode_getnewargs(PyUnicodeObject *v)
8327{
8328 return Py_BuildValue("(u#)", v->str, v->length);
8329}
8330
8331
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332static PyMethodDef unicode_methods[] = {
8333
8334 /* Order is according to common usage: often used methods should
8335 appear first, since lookup is done sequentially. */
8336
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008337 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8338 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8339 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008340 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008341 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8342 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8343 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8344 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8345 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8346 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8347 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008348 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008349 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8350 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8351 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008352 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008353 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8354 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8355 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008356 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008357 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008358 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008359 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008360 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8361 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8362 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8363 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8364 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8365 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8366 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8367 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8368 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8369 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8370 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8371 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8372 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8373 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008374 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008375 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008376 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008377 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008378 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8379 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008380 {"maketrans", (PyCFunction) unicode_maketrans,
8381 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008382 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008383#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008384 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385#endif
8386
8387#if 0
8388 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008389 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390#endif
8391
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008392 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 {NULL, NULL}
8394};
8395
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008396static PyObject *
8397unicode_mod(PyObject *v, PyObject *w)
8398{
8399 if (!PyUnicode_Check(v)) {
8400 Py_INCREF(Py_NotImplemented);
8401 return Py_NotImplemented;
8402 }
8403 return PyUnicode_Format(v, w);
8404}
8405
8406static PyNumberMethods unicode_as_number = {
8407 0, /*nb_add*/
8408 0, /*nb_subtract*/
8409 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008410 unicode_mod, /*nb_remainder*/
8411};
8412
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008414 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008415 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008416 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8417 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008418 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 0, /* sq_ass_item */
8420 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008421 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422};
8423
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008424static PyObject*
8425unicode_subscript(PyUnicodeObject* self, PyObject* item)
8426{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008427 if (PyIndex_Check(item)) {
8428 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008429 if (i == -1 && PyErr_Occurred())
8430 return NULL;
8431 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008432 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008433 return unicode_getitem(self, i);
8434 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008435 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008436 Py_UNICODE* source_buf;
8437 Py_UNICODE* result_buf;
8438 PyObject* result;
8439
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008440 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008441 &start, &stop, &step, &slicelength) < 0) {
8442 return NULL;
8443 }
8444
8445 if (slicelength <= 0) {
8446 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008447 } else if (start == 0 && step == 1 && slicelength == self->length &&
8448 PyUnicode_CheckExact(self)) {
8449 Py_INCREF(self);
8450 return (PyObject *)self;
8451 } else if (step == 1) {
8452 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008453 } else {
8454 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008455 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8456 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008457
8458 if (result_buf == NULL)
8459 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008460
8461 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8462 result_buf[i] = source_buf[cur];
8463 }
Tim Petersced69f82003-09-16 20:30:58 +00008464
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008465 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008466 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008467 return result;
8468 }
8469 } else {
8470 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8471 return NULL;
8472 }
8473}
8474
8475static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008476 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008477 (binaryfunc)unicode_subscript, /* mp_subscript */
8478 (objobjargproc)0, /* mp_ass_subscript */
8479};
8480
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482/* Helpers for PyUnicode_Format() */
8483
8484static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008485getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008487 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488 if (argidx < arglen) {
8489 (*p_argidx)++;
8490 if (arglen < 0)
8491 return args;
8492 else
8493 return PyTuple_GetItem(args, argidx);
8494 }
8495 PyErr_SetString(PyExc_TypeError,
8496 "not enough arguments for format string");
8497 return NULL;
8498}
8499
Martin v. Löwis18e16552006-02-15 17:27:45 +00008500static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008501strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008503 register Py_ssize_t i;
8504 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505 for (i = len - 1; i >= 0; i--)
8506 buffer[i] = (Py_UNICODE) charbuffer[i];
8507
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 return len;
8509}
8510
Neal Norwitzfc76d632006-01-10 06:03:13 +00008511static int
8512doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8513{
Tim Peters15231542006-02-16 01:08:01 +00008514 Py_ssize_t result;
8515
Neal Norwitzfc76d632006-01-10 06:03:13 +00008516 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008517 result = strtounicode(buffer, (char *)buffer);
8518 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008519}
8520
#if 0
/* Disabled code: format the long `x` with `format` into `buffer`.

   The text is written as chars into the same storage by PyOS_snprintf
   (bounded by `len`) and then widened in place by strtounicode().
   Returns the character count as an int. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *narrow = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(narrow, len, format, x);
    nchars = strtounicode(buffer, narrow);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008532
Guido van Rossum078151d2002-08-11 04:24:12 +00008533/* XXX To save some code duplication, formatfloat/long/int could have been
8534 shared with stringobject.c, converting from 8-bit to Unicode after the
8535 formatting is done. */
8536
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537static int
8538formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008539 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540 int flags,
8541 int prec,
8542 int type,
8543 PyObject *v)
8544{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008545 /* fmt = '%#.' + `prec` + `type`
8546 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547 char fmt[20];
8548 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008549
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550 x = PyFloat_AsDouble(v);
8551 if (x == -1.0 && PyErr_Occurred())
8552 return -1;
8553 if (prec < 0)
8554 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8556 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008557 /* Worst case length calc to ensure no buffer overrun:
8558
8559 'g' formats:
8560 fmt = %#.<prec>g
8561 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8562 for any double rep.)
8563 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8564
8565 'f' formats:
8566 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8567 len = 1 + 50 + 1 + prec = 52 + prec
8568
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008569 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008570 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008571
8572 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008573 if (((type == 'g' || type == 'G') &&
8574 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008575 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008576 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008577 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008578 return -1;
8579 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008580 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8581 (flags&F_ALT) ? "#" : "",
8582 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008583 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584}
8585
Tim Peters38fd5b62000-09-21 05:43:11 +00008586static PyObject*
8587formatlong(PyObject *val, int flags, int prec, int type)
8588{
8589 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008590 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008591 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008592 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008593
Christian Heimes72b710a2008-05-26 13:28:38 +00008594 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008595 if (!str)
8596 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008597 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008598 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008599 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008600}
8601
#if 0
/* Disabled code: format the C long value of `v` into `buf` (capacity
   `buflen`) for an integer conversion `type`, honouring F_ALT in `flags`
   and the precision `prec` (negative means 1).

   Returns the result of longtounicode(), or -1 with an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign;
    long x;
    int radix_type;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    /* A negative value under %u is formatted as %d. */
    if (x < 0 && type == 'u')
        type = 'd';

    /* For the octal/hex conversions the minus sign is emitted by us and
       the magnitude is what gets formatted. */
    radix_type = (type == 'x' || type == 'X' || type == 'o');
    sign = (x < 0 && radix_type) ? "-" : "";

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && radix_type) {
        /* %#o, %#x and %#X are troublesome when left to the C library:
         * - for %#o, we want a different base marker than C
         * - for a zero value the C standard omits the '0x'/'0X'
         *   prefix, which disagrees with the other %#x/%#X results
         *   and with Python's hex() function
         * - some platforms violate the standard and do add the
         *   prefix for zero (Metrowerks, Compaq Tru64)
         * - some platforms emit '0x' under %#X but handle zero as
         *   the standard says (OS/2 EMX)
         *
         * Consistency is obtained by writing our own prefix and
         * asking the C library for plain %x/%X instead of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679
8680static int
8681formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008682 size_t buflen,
8683 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008685 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008686 if (PyUnicode_Check(v)) {
8687 if (PyUnicode_GET_SIZE(v) != 1)
8688 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691 else {
8692 /* Integer input truncated to a character */
8693 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008694 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008696 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008697#ifdef Py_UNICODE_WIDE
8698 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008699 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008700 "%c arg not in range(0x110000) "
8701 "(wide Python build)");
8702 return -1;
8703 }
8704#else
8705 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008706 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008707 "%c arg not in range(0x10000) "
8708 "(narrow Python build)");
8709 return -1;
8710 }
8711#endif
8712 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 }
8714 buf[1] = '\0';
8715 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008716
8717 onError:
8718 PyErr_SetString(PyExc_TypeError,
8719 "%c requires int or char");
8720 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721}
8722
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size of the scratch buffer into which floats,
   ints and chars are formatted.  XXX The value is a magic number.  The
   individual formatting routines perform bounds checks against it, but
   allocating a buffer of the right size per conversion might be a better
   design.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
8732
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733PyObject *PyUnicode_Format(PyObject *format,
8734 PyObject *args)
8735{
8736 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008737 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 int args_owned = 0;
8739 PyUnicodeObject *result = NULL;
8740 PyObject *dict = NULL;
8741 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008742
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743 if (format == NULL || args == NULL) {
8744 PyErr_BadInternalCall();
8745 return NULL;
8746 }
8747 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008748 if (uformat == NULL)
8749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 fmt = PyUnicode_AS_UNICODE(uformat);
8751 fmtcnt = PyUnicode_GET_SIZE(uformat);
8752
8753 reslen = rescnt = fmtcnt + 100;
8754 result = _PyUnicode_New(reslen);
8755 if (result == NULL)
8756 goto onError;
8757 res = PyUnicode_AS_UNICODE(result);
8758
8759 if (PyTuple_Check(args)) {
8760 arglen = PyTuple_Size(args);
8761 argidx = 0;
8762 }
8763 else {
8764 arglen = -1;
8765 argidx = -2;
8766 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008767 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008768 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 dict = args;
8770
8771 while (--fmtcnt >= 0) {
8772 if (*fmt != '%') {
8773 if (--rescnt < 0) {
8774 rescnt = fmtcnt + 100;
8775 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008776 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008777 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8779 --rescnt;
8780 }
8781 *res++ = *fmt++;
8782 }
8783 else {
8784 /* Got a format specifier */
8785 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008786 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 Py_UNICODE c = '\0';
8789 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008790 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 PyObject *v = NULL;
8792 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008793 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008795 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008796 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797
8798 fmt++;
8799 if (*fmt == '(') {
8800 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008801 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802 PyObject *key;
8803 int pcount = 1;
8804
8805 if (dict == NULL) {
8806 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008807 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808 goto onError;
8809 }
8810 ++fmt;
8811 --fmtcnt;
8812 keystart = fmt;
8813 /* Skip over balanced parentheses */
8814 while (pcount > 0 && --fmtcnt >= 0) {
8815 if (*fmt == ')')
8816 --pcount;
8817 else if (*fmt == '(')
8818 ++pcount;
8819 fmt++;
8820 }
8821 keylen = fmt - keystart - 1;
8822 if (fmtcnt < 0 || pcount > 0) {
8823 PyErr_SetString(PyExc_ValueError,
8824 "incomplete format key");
8825 goto onError;
8826 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008827#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008828 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 then looked up since Python uses strings to hold
8830 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008831 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832 key = PyUnicode_EncodeUTF8(keystart,
8833 keylen,
8834 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008835#else
8836 key = PyUnicode_FromUnicode(keystart, keylen);
8837#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838 if (key == NULL)
8839 goto onError;
8840 if (args_owned) {
8841 Py_DECREF(args);
8842 args_owned = 0;
8843 }
8844 args = PyObject_GetItem(dict, key);
8845 Py_DECREF(key);
8846 if (args == NULL) {
8847 goto onError;
8848 }
8849 args_owned = 1;
8850 arglen = -1;
8851 argidx = -2;
8852 }
8853 while (--fmtcnt >= 0) {
8854 switch (c = *fmt++) {
8855 case '-': flags |= F_LJUST; continue;
8856 case '+': flags |= F_SIGN; continue;
8857 case ' ': flags |= F_BLANK; continue;
8858 case '#': flags |= F_ALT; continue;
8859 case '0': flags |= F_ZERO; continue;
8860 }
8861 break;
8862 }
8863 if (c == '*') {
8864 v = getnextarg(args, arglen, &argidx);
8865 if (v == NULL)
8866 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008867 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 PyErr_SetString(PyExc_TypeError,
8869 "* wants int");
8870 goto onError;
8871 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008872 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008873 if (width == -1 && PyErr_Occurred())
8874 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 if (width < 0) {
8876 flags |= F_LJUST;
8877 width = -width;
8878 }
8879 if (--fmtcnt >= 0)
8880 c = *fmt++;
8881 }
8882 else if (c >= '0' && c <= '9') {
8883 width = c - '0';
8884 while (--fmtcnt >= 0) {
8885 c = *fmt++;
8886 if (c < '0' || c > '9')
8887 break;
8888 if ((width*10) / 10 != width) {
8889 PyErr_SetString(PyExc_ValueError,
8890 "width too big");
8891 goto onError;
8892 }
8893 width = width*10 + (c - '0');
8894 }
8895 }
8896 if (c == '.') {
8897 prec = 0;
8898 if (--fmtcnt >= 0)
8899 c = *fmt++;
8900 if (c == '*') {
8901 v = getnextarg(args, arglen, &argidx);
8902 if (v == NULL)
8903 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008904 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 PyErr_SetString(PyExc_TypeError,
8906 "* wants int");
8907 goto onError;
8908 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008909 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008910 if (prec == -1 && PyErr_Occurred())
8911 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 if (prec < 0)
8913 prec = 0;
8914 if (--fmtcnt >= 0)
8915 c = *fmt++;
8916 }
8917 else if (c >= '0' && c <= '9') {
8918 prec = c - '0';
8919 while (--fmtcnt >= 0) {
8920 c = Py_CHARMASK(*fmt++);
8921 if (c < '0' || c > '9')
8922 break;
8923 if ((prec*10) / 10 != prec) {
8924 PyErr_SetString(PyExc_ValueError,
8925 "prec too big");
8926 goto onError;
8927 }
8928 prec = prec*10 + (c - '0');
8929 }
8930 }
8931 } /* prec */
8932 if (fmtcnt >= 0) {
8933 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 if (--fmtcnt >= 0)
8935 c = *fmt++;
8936 }
8937 }
8938 if (fmtcnt < 0) {
8939 PyErr_SetString(PyExc_ValueError,
8940 "incomplete format");
8941 goto onError;
8942 }
8943 if (c != '%') {
8944 v = getnextarg(args, arglen, &argidx);
8945 if (v == NULL)
8946 goto onError;
8947 }
8948 sign = 0;
8949 fill = ' ';
8950 switch (c) {
8951
8952 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008953 pbuf = formatbuf;
8954 /* presume that buffer length is at least 1 */
8955 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956 len = 1;
8957 break;
8958
8959 case 's':
8960 case 'r':
8961 if (PyUnicode_Check(v) && c == 's') {
8962 temp = v;
8963 Py_INCREF(temp);
8964 }
8965 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008967 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968 else
8969 temp = PyObject_Repr(v);
8970 if (temp == NULL)
8971 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008972 if (PyUnicode_Check(temp))
8973 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008974 else {
8975 Py_DECREF(temp);
8976 PyErr_SetString(PyExc_TypeError,
8977 "%s argument has non-string str()");
8978 goto onError;
8979 }
8980 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008981 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 len = PyUnicode_GET_SIZE(temp);
8983 if (prec >= 0 && len > prec)
8984 len = prec;
8985 break;
8986
8987 case 'i':
8988 case 'd':
8989 case 'u':
8990 case 'o':
8991 case 'x':
8992 case 'X':
8993 if (c == 'i')
8994 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00008995 isnumok = 0;
8996 if (PyNumber_Check(v)) {
8997 PyObject *iobj=NULL;
8998
8999 if (PyLong_Check(v)) {
9000 iobj = v;
9001 Py_INCREF(iobj);
9002 }
9003 else {
9004 iobj = PyNumber_Long(v);
9005 }
9006 if (iobj!=NULL) {
9007 if (PyLong_Check(iobj)) {
9008 isnumok = 1;
9009 temp = formatlong(iobj, flags, prec, c);
9010 Py_DECREF(iobj);
9011 if (!temp)
9012 goto onError;
9013 pbuf = PyUnicode_AS_UNICODE(temp);
9014 len = PyUnicode_GET_SIZE(temp);
9015 sign = 1;
9016 }
9017 else {
9018 Py_DECREF(iobj);
9019 }
9020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009022 if (!isnumok) {
9023 PyErr_Format(PyExc_TypeError,
9024 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009025 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009026 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009027 }
9028 if (flags & F_ZERO)
9029 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 break;
9031
9032 case 'e':
9033 case 'E':
9034 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009035 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 case 'g':
9037 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009038 if (c == 'F')
9039 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009040 pbuf = formatbuf;
9041 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9042 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 if (len < 0)
9044 goto onError;
9045 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009046 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047 fill = '0';
9048 break;
9049
9050 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009051 pbuf = formatbuf;
9052 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053 if (len < 0)
9054 goto onError;
9055 break;
9056
9057 default:
9058 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009059 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009060 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009061 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009062 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009063 (Py_ssize_t)(fmt - 1 -
9064 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065 goto onError;
9066 }
9067 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009068 if (*pbuf == '-' || *pbuf == '+') {
9069 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070 len--;
9071 }
9072 else if (flags & F_SIGN)
9073 sign = '+';
9074 else if (flags & F_BLANK)
9075 sign = ' ';
9076 else
9077 sign = 0;
9078 }
9079 if (width < len)
9080 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009081 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082 reslen -= rescnt;
9083 rescnt = width + fmtcnt + 100;
9084 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009085 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009086 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009087 PyErr_NoMemory();
9088 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009089 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009090 if (_PyUnicode_Resize(&result, reslen) < 0) {
9091 Py_XDECREF(temp);
9092 goto onError;
9093 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 res = PyUnicode_AS_UNICODE(result)
9095 + reslen - rescnt;
9096 }
9097 if (sign) {
9098 if (fill != ' ')
9099 *res++ = sign;
9100 rescnt--;
9101 if (width > len)
9102 width--;
9103 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009104 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009105 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009106 assert(pbuf[1] == c);
9107 if (fill != ' ') {
9108 *res++ = *pbuf++;
9109 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009110 }
Tim Petersfff53252001-04-12 18:38:48 +00009111 rescnt -= 2;
9112 width -= 2;
9113 if (width < 0)
9114 width = 0;
9115 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009116 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117 if (width > len && !(flags & F_LJUST)) {
9118 do {
9119 --rescnt;
9120 *res++ = fill;
9121 } while (--width > len);
9122 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009123 if (fill == ' ') {
9124 if (sign)
9125 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009126 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009127 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009128 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009129 *res++ = *pbuf++;
9130 *res++ = *pbuf++;
9131 }
9132 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009133 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 res += len;
9135 rescnt -= len;
9136 while (--width >= len) {
9137 --rescnt;
9138 *res++ = ' ';
9139 }
9140 if (dict && (argidx < arglen) && c != '%') {
9141 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009142 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009143 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144 goto onError;
9145 }
9146 Py_XDECREF(temp);
9147 } /* '%' */
9148 } /* until end */
9149 if (argidx < arglen && !dict) {
9150 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009151 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 goto onError;
9153 }
9154
Thomas Woutersa96affe2006-03-12 00:29:36 +00009155 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9156 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 if (args_owned) {
9158 Py_DECREF(args);
9159 }
9160 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161 return (PyObject *)result;
9162
9163 onError:
9164 Py_XDECREF(result);
9165 Py_DECREF(uformat);
9166 if (args_owned) {
9167 Py_DECREF(args);
9168 }
9169 return NULL;
9170}
9171
Jeremy Hylton938ace62002-07-17 16:30:39 +00009172static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009173unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9174
Tim Peters6d6c1a32001-08-02 04:15:00 +00009175static PyObject *
9176unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9177{
9178 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009179 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009180 char *encoding = NULL;
9181 char *errors = NULL;
9182
Guido van Rossume023fe02001-08-30 03:12:59 +00009183 if (type != &PyUnicode_Type)
9184 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009185 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009186 kwlist, &x, &encoding, &errors))
9187 return NULL;
9188 if (x == NULL)
9189 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009190 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009191 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009192 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009193 return PyUnicode_FromEncodedObject(x, encoding, errors);
9194}
9195
Guido van Rossume023fe02001-08-30 03:12:59 +00009196static PyObject *
9197unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9198{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009199 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009200 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009201
9202 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9203 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9204 if (tmp == NULL)
9205 return NULL;
9206 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009207 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009208 if (pnew == NULL) {
9209 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009210 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009211 }
Christian Heimesb186d002008-03-18 15:15:01 +00009212 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009213 if (pnew->str == NULL) {
9214 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009215 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009216 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009217 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009218 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009219 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9220 pnew->length = n;
9221 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009222 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009223 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009224}
9225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009226PyDoc_STRVAR(unicode_doc,
Georg Brandl17cb8a82008-05-30 08:20:09 +00009227"str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009228\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009229Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009230encoding defaults to the current default string encoding.\n\
9231errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009232
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009233static PyObject *unicode_iter(PyObject *seq);
9234
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009236 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009237 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 sizeof(PyUnicodeObject), /* tp_size */
9239 0, /* tp_itemsize */
9240 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009241 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009243 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009245 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009246 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009247 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009249 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250 (hashfunc) unicode_hash, /* tp_hash*/
9251 0, /* tp_call*/
9252 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009253 PyObject_GenericGetAttr, /* tp_getattro */
9254 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009255 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009256 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9257 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009258 unicode_doc, /* tp_doc */
9259 0, /* tp_traverse */
9260 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009261 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009262 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009263 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009264 0, /* tp_iternext */
9265 unicode_methods, /* tp_methods */
9266 0, /* tp_members */
9267 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009268 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009269 0, /* tp_dict */
9270 0, /* tp_descr_get */
9271 0, /* tp_descr_set */
9272 0, /* tp_dictoffset */
9273 0, /* tp_init */
9274 0, /* tp_alloc */
9275 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009276 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009277};
9278
9279/* Initialize the Unicode implementation */
9280
Thomas Wouters78890102000-07-22 19:25:51 +00009281void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009283 int i;
9284
Thomas Wouters477c8d52006-05-27 19:21:47 +00009285 /* XXX - move this array to unicodectype.c ? */
9286 Py_UNICODE linebreak[] = {
9287 0x000A, /* LINE FEED */
9288 0x000D, /* CARRIAGE RETURN */
9289 0x001C, /* FILE SEPARATOR */
9290 0x001D, /* GROUP SEPARATOR */
9291 0x001E, /* RECORD SEPARATOR */
9292 0x0085, /* NEXT LINE */
9293 0x2028, /* LINE SEPARATOR */
9294 0x2029, /* PARAGRAPH SEPARATOR */
9295 };
9296
Fred Drakee4315f52000-05-09 19:53:39 +00009297 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009298 free_list = NULL;
9299 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009300 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009301 if (!unicode_empty)
9302 return;
9303
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009304 for (i = 0; i < 256; i++)
9305 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009306 if (PyType_Ready(&PyUnicode_Type) < 0)
9307 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009308
9309 /* initialize the linebreak bloom filter */
9310 bloom_linebreak = make_bloom_mask(
9311 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9312 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009313
9314 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315}
9316
9317/* Finalize the Unicode implementation */
9318
Christian Heimesa156e092008-02-16 07:38:31 +00009319int
9320PyUnicode_ClearFreeList(void)
9321{
9322 int freelist_size = numfree;
9323 PyUnicodeObject *u;
9324
9325 for (u = free_list; u != NULL;) {
9326 PyUnicodeObject *v = u;
9327 u = *(PyUnicodeObject **)u;
9328 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009329 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009330 Py_XDECREF(v->defenc);
9331 PyObject_Del(v);
9332 numfree--;
9333 }
9334 free_list = NULL;
9335 assert(numfree == 0);
9336 return freelist_size;
9337}
9338
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339void
Thomas Wouters78890102000-07-22 19:25:51 +00009340_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009342 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009344 Py_XDECREF(unicode_empty);
9345 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009346
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009347 for (i = 0; i < 256; i++) {
9348 if (unicode_latin1[i]) {
9349 Py_DECREF(unicode_latin1[i]);
9350 unicode_latin1[i] = NULL;
9351 }
9352 }
Christian Heimesa156e092008-02-16 07:38:31 +00009353 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009355
Walter Dörwald16807132007-05-25 13:52:07 +00009356void
9357PyUnicode_InternInPlace(PyObject **p)
9358{
9359 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9360 PyObject *t;
9361 if (s == NULL || !PyUnicode_Check(s))
9362 Py_FatalError(
9363 "PyUnicode_InternInPlace: unicode strings only please!");
9364 /* If it's a subclass, we don't really know what putting
9365 it in the interned dict might do. */
9366 if (!PyUnicode_CheckExact(s))
9367 return;
9368 if (PyUnicode_CHECK_INTERNED(s))
9369 return;
9370 if (interned == NULL) {
9371 interned = PyDict_New();
9372 if (interned == NULL) {
9373 PyErr_Clear(); /* Don't leave an exception */
9374 return;
9375 }
9376 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009377 /* It might be that the GetItem call fails even
9378 though the key is present in the dictionary,
9379 namely when this happens during a stack overflow. */
9380 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009381 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009382 Py_END_ALLOW_RECURSION
9383
Walter Dörwald16807132007-05-25 13:52:07 +00009384 if (t) {
9385 Py_INCREF(t);
9386 Py_DECREF(*p);
9387 *p = t;
9388 return;
9389 }
9390
Martin v. Löwis5b222132007-06-10 09:51:05 +00009391 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009392 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9393 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009394 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009395 return;
9396 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009397 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009398 /* The two references in interned are not counted by refcnt.
9399 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009400 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009401 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9402}
9403
9404void
9405PyUnicode_InternImmortal(PyObject **p)
9406{
9407 PyUnicode_InternInPlace(p);
9408 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9409 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9410 Py_INCREF(*p);
9411 }
9412}
9413
9414PyObject *
9415PyUnicode_InternFromString(const char *cp)
9416{
9417 PyObject *s = PyUnicode_FromString(cp);
9418 if (s == NULL)
9419 return NULL;
9420 PyUnicode_InternInPlace(&s);
9421 return s;
9422}
9423
9424void _Py_ReleaseInternedUnicodeStrings(void)
9425{
9426 PyObject *keys;
9427 PyUnicodeObject *s;
9428 Py_ssize_t i, n;
9429 Py_ssize_t immortal_size = 0, mortal_size = 0;
9430
9431 if (interned == NULL || !PyDict_Check(interned))
9432 return;
9433 keys = PyDict_Keys(interned);
9434 if (keys == NULL || !PyList_Check(keys)) {
9435 PyErr_Clear();
9436 return;
9437 }
9438
9439 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9440 detector, interned unicode strings are not forcibly deallocated;
9441 rather, we give them their stolen references back, and then clear
9442 and DECREF the interned dict. */
9443
9444 n = PyList_GET_SIZE(keys);
9445 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9446 n);
9447 for (i = 0; i < n; i++) {
9448 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9449 switch (s->state) {
9450 case SSTATE_NOT_INTERNED:
9451 /* XXX Shouldn't happen */
9452 break;
9453 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009454 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009455 immortal_size += s->length;
9456 break;
9457 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009458 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009459 mortal_size += s->length;
9460 break;
9461 default:
9462 Py_FatalError("Inconsistent interned string state.");
9463 }
9464 s->state = SSTATE_NOT_INTERNED;
9465 }
9466 fprintf(stderr, "total size of all interned strings: "
9467 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9468 "mortal/immortal\n", mortal_size, immortal_size);
9469 Py_DECREF(keys);
9470 PyDict_Clear(interned);
9471 Py_DECREF(interned);
9472 interned = NULL;
9473}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009474
9475
9476/********************* Unicode Iterator **************************/
9477
9478typedef struct {
9479 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009480 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009481 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9482} unicodeiterobject;
9483
9484static void
9485unicodeiter_dealloc(unicodeiterobject *it)
9486{
9487 _PyObject_GC_UNTRACK(it);
9488 Py_XDECREF(it->it_seq);
9489 PyObject_GC_Del(it);
9490}
9491
9492static int
9493unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9494{
9495 Py_VISIT(it->it_seq);
9496 return 0;
9497}
9498
9499static PyObject *
9500unicodeiter_next(unicodeiterobject *it)
9501{
9502 PyUnicodeObject *seq;
9503 PyObject *item;
9504
9505 assert(it != NULL);
9506 seq = it->it_seq;
9507 if (seq == NULL)
9508 return NULL;
9509 assert(PyUnicode_Check(seq));
9510
9511 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009512 item = PyUnicode_FromUnicode(
9513 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009514 if (item != NULL)
9515 ++it->it_index;
9516 return item;
9517 }
9518
9519 Py_DECREF(seq);
9520 it->it_seq = NULL;
9521 return NULL;
9522}
9523
9524static PyObject *
9525unicodeiter_len(unicodeiterobject *it)
9526{
9527 Py_ssize_t len = 0;
9528 if (it->it_seq)
9529 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009530 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009531}
9532
9533PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9534
9535static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009536 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9537 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009538 {NULL, NULL} /* sentinel */
9539};
9540
9541PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009542 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009543 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009544 sizeof(unicodeiterobject), /* tp_basicsize */
9545 0, /* tp_itemsize */
9546 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009547 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009548 0, /* tp_print */
9549 0, /* tp_getattr */
9550 0, /* tp_setattr */
9551 0, /* tp_compare */
9552 0, /* tp_repr */
9553 0, /* tp_as_number */
9554 0, /* tp_as_sequence */
9555 0, /* tp_as_mapping */
9556 0, /* tp_hash */
9557 0, /* tp_call */
9558 0, /* tp_str */
9559 PyObject_GenericGetAttr, /* tp_getattro */
9560 0, /* tp_setattro */
9561 0, /* tp_as_buffer */
9562 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9563 0, /* tp_doc */
9564 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9565 0, /* tp_clear */
9566 0, /* tp_richcompare */
9567 0, /* tp_weaklistoffset */
9568 PyObject_SelfIter, /* tp_iter */
9569 (iternextfunc)unicodeiter_next, /* tp_iternext */
9570 unicodeiter_methods, /* tp_methods */
9571 0,
9572};
9573
9574static PyObject *
9575unicode_iter(PyObject *seq)
9576{
9577 unicodeiterobject *it;
9578
9579 if (!PyUnicode_Check(seq)) {
9580 PyErr_BadInternalCall();
9581 return NULL;
9582 }
9583 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9584 if (it == NULL)
9585 return NULL;
9586 it->it_index = 0;
9587 Py_INCREF(seq);
9588 it->it_seq = (PyUnicodeObject *)seq;
9589 _PyObject_GC_TRACK(it);
9590 return (PyObject *)it;
9591}
9592
Martin v. Löwis5b222132007-06-10 09:51:05 +00009593size_t
9594Py_UNICODE_strlen(const Py_UNICODE *u)
9595{
9596 int res = 0;
9597 while(*u++)
9598 res++;
9599 return res;
9600}
9601
9602Py_UNICODE*
9603Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9604{
9605 Py_UNICODE *u = s1;
9606 while ((*u++ = *s2++));
9607 return s1;
9608}
9609
9610Py_UNICODE*
9611Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9612{
9613 Py_UNICODE *u = s1;
9614 while ((*u++ = *s2++))
9615 if (n-- == 0)
9616 break;
9617 return s1;
9618}
9619
9620int
9621Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9622{
9623 while (*s1 && *s2 && *s1 == *s2)
9624 s1++, s2++;
9625 if (*s1 && *s2)
9626 return (*s1 < *s2) ? -1 : +1;
9627 if (*s1)
9628 return 1;
9629 if (*s2)
9630 return -1;
9631 return 0;
9632}
9633
9634Py_UNICODE*
9635Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9636{
9637 const Py_UNICODE *p;
9638 for (p = s; *p; p++)
9639 if (*p == c)
9640 return (Py_UNICODE*)p;
9641 return NULL;
9642}
9643
9644
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009645#ifdef __cplusplus
9646}
9647#endif
9648
9649
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009650/*
9651Local variables:
9652c-basic-offset: 4
9653indent-tabs-mode: nil
9654End:
9655*/